Compare commits

..

4 Commits

Author SHA1 Message Date
4c42267293 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:26:20 +00:00
24f68342b4 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 02:17:42 +00:00
c5d963835b updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 01:54:01 +00:00
b313220727 updates
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-27 23:51:36 +00:00
1491 changed files with 42728 additions and 123675 deletions

View File

@ -8,12 +8,12 @@ import zipfile
# Note that we have 400 MiB quota, please use it wisely. # Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 . # See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile. # Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file.""" """Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, "r") as z: with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True) file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]: for f, size in file_sizes[:10]:
@ -28,18 +28,14 @@ def check_wheel_size(directory):
wheel_path = os.path.join(root, file_name) wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB: if wheel_size_mb > VLLM_MAX_SIZE_MB:
print( print(f"Not allowed: Wheel {wheel_path} is larger "
f"Not allowed: Wheel {wheel_path} is larger " f"({wheel_size_mb:.2f} MB) than the limit "
f"({wheel_size_mb:.2f} MB) than the limit " f"({VLLM_MAX_SIZE_MB} MB).")
f"({VLLM_MAX_SIZE_MB} MB)."
)
print_top_10_largest_files(wheel_path) print_top_10_largest_files(wheel_path)
return 1 return 1
else: else:
print( print(f"Wheel {wheel_path} is within the allowed size "
f"Wheel {wheel_path} is within the allowed size " f"({wheel_size_mb:.2f} MB).")
f"({wheel_size_mb:.2f} MB)."
)
return 0 return 0
@ -49,4 +45,4 @@ if __name__ == "__main__":
sys.exit(1) sys.exit(1)
directory = sys.argv[1] directory = sys.argv[1]
sys.exit(check_wheel_size(directory)) sys.exit(check_wheel_size(directory))

View File

@ -22,5 +22,5 @@ with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}") print(f"Generated index.html for {args.wheel}")
# cloudfront requires escaping the '+' character # cloudfront requires escaping the '+' character
f.write( f.write(
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) template.format(wheel=filename,
) wheel_html_escaped=filename.replace("+", "%2B")))

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct" model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks: tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct" model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128" model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks: tasks:

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.335
- name: "exact_match,flexible-extract"
value: 0.323
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8" model_name: "mgoin/Minitron-4B-Base-FP8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks: tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"

View File

@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.30
- name: "exact_match,flexible-extract"
value: 0.465
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct" model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks: tasks:

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.54
- name: "exact_match,flexible-extract"
value: 0.59
limit: 1319
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.47
- name: "exact_match,flexible-extract"
value: 0.64
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks: tasks:

View File

@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,6 +1,10 @@
Qwen2.5-1.5B-Instruct.yaml Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,43 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import pytest
def pytest_addoption(parser):
    """Register the command-line options used by the lm-eval tests.

    Adds ``--config-list-file`` (no default) and ``--tp-size`` (default
    ``"1"``) to the given pytest option parser.
    """
    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        (
            "--config-list-file",
            {
                "action": "store",
                "help": "Path to the file listing model config YAMLs (one per line)",
            },
        ),
        (
            "--tp-size",
            {
                "action": "store",
                "default": "1",
                "help": "Tensor parallel size to use for evaluation",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    """Return the ``--config-list-file`` option joined onto ``config_dir``."""
    # NOTE(review): ``config_dir`` is requested as a fixture but is not
    # defined in the visible part of this file -- confirm it exists elsewhere.
    return config_dir / pytestconfig.getoption("--config-list-file")
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Return the value of the ``--tp-size`` command-line option."""
    requested_tp_size = pytestconfig.getoption("--tp-size")
    return requested_tp_size
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the ``--config-list-file`` option.

    The list file is read line by line; every non-blank, non-comment line is
    treated as a path relative to the directory containing the list file.
    Tests that do not request ``config_filename`` are left untouched.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    config_list_file = Path(rel_path).resolve()
    config_dir = config_list_file.parent

    configs = []
    with open(config_list_file, encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            # Check the stripped text: testing the raw line let an indented
            # "# comment" through as if it were a config path.
            if not entry or entry.startswith("#"):
                continue
            configs.append(config_dir / entry)

    metafunc.parametrize("config_filename", configs)

View File

@ -0,0 +1,59 @@
#!/bin/bash
# Print the command-line help for this script to stdout.
usage() {
    # Plain `echo` for the leading blank line; the previous stray backticks
    # ran an empty command substitution.
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -c - path to the test data config (e.g. configs/small-models.txt)"
    echo " -t - tensor parallel size"
    echo
}
# Accumulated exit status across all model configs (0 means every run passed).
SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Fail fast on a missing or unreadable config list. Without this check the
# redirection below fails, the loop runs zero times and the script exits 0,
# which would report success without testing anything.
if [[ -z "${CONFIG:-}" || ! -r "${CONFIG}" ]]; then
  echo "ERROR: -c must point to a readable config list file." >&2
  usage
  exit 1
fi

# Fall back to a tensor parallel size of 1 so LM_EVAL_TP_SIZE is never
# exported as an empty string.
TP_SIZE="${TP_SIZE:-1}"

# Parse list of configs.
# `read -d ''` returns non-zero at end of input even when the array was
# filled, so its exit status is intentionally not checked.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

if [[ ${#MODEL_CONFIGS[@]} -eq 0 ]]; then
  echo "ERROR: no model configs listed in ${CONFIG}." >&2
  exit 1
fi

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
  LOCAL_SUCCESS=0

  echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

  export LM_EVAL_TEST_DATA_FILE="$PWD/configs/${MODEL_CONFIG}"
  export LM_EVAL_TP_SIZE="$TP_SIZE"
  # Record the failure but keep going so every model gets evaluated.
  pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

  if [[ $LOCAL_SUCCESS == 0 ]]; then
    echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
  else
    echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
  fi

  SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

# Collapse the accumulated status into a plain pass/fail exit code.
if [ "${SUCCESS}" -eq "0" ]; then
  exit 0
else
  exit 1
fi

View File

@ -3,52 +3,67 @@
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
pytest -s -v test_lm_eval_correctness.py \ * export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
--config-list-file=configs/models-small.txt \ * export LM_EVAL_TP_SIZE=4
--tp-size=1 * pytest -s test_lm_eval_correctness.py
""" """
import os
from pathlib import Path
import lm_eval import lm_eval
import numpy as np import numpy
import pytest
import yaml import yaml
RTOL = 0.08 RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get("trust_remote_code", False) trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = (
f"pretrained={eval_config['model_name']}," model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={tp_size}," f"tensor_parallel_size={TP_SIZE}," \
f"enforce_eager=true," f"add_bos_token=true," \
f"add_bos_token=true," f"trust_remote_code={trust_remote_code}"
f"trust_remote_code={trust_remote_code}"
)
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
model_args=model_args, model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]], tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"], num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"], limit=eval_config["limit"],
batch_size="auto", batch_size="auto")
)
return results return results
def test_lm_eval_correctness_param(config_filename, tp_size): def test_lm_eval_correctness():
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
results = launch_lm_eval(eval_config, tp_size) if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
ground_truth = metric["value"] ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print( print(f'{task["name"]} | {metric["name"]}: '
f"{task['name']} | {metric['name']}: " f'ground_truth={ground_truth} | measured={measured_value}')
f"ground_truth={ground_truth} | measured={measured_value}" success = success and numpy.isclose(
) ground_truth, measured_value, rtol=RTOL)
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success assert success

View File

@ -65,18 +65,18 @@ def read_markdown(file):
def results_to_json(latency, throughput, serving): def results_to_json(latency, throughput, serving):
return json.dumps( return json.dumps({
{ 'latency': latency.to_dict(),
"latency": latency.to_dict(), 'throughput': throughput.to_dict(),
"throughput": throughput.to_dict(), 'serving': serving.to_dict()
"serving": serving.to_dict(), })
}
)
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
@ -120,8 +120,7 @@ if __name__ == "__main__":
for perc in [10, 25, 50, 75, 90, 99]: for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms # Multiply 1000 to convert the time unit from s to ms
raw_result.update( raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
)
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result # add the result to raw_result
@ -154,27 +153,26 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results) throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json( raw_results_json = results_to_json(latency_results, throughput_results,
latency_results, throughput_results, serving_results serving_results)
)
# remapping the key, for visualization purpose # remapping the key, for visualization purpose
if not latency_results.empty: if not latency_results.empty:
latency_results = latency_results[list(latency_column_mapping.keys())].rename( latency_results = latency_results[list(
columns=latency_column_mapping latency_column_mapping.keys())].rename(
) columns=latency_column_mapping)
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename( serving_results = serving_results[list(
columns=serving_column_mapping serving_column_mapping.keys())].rename(
) columns=serving_column_mapping)
if not throughput_results.empty: if not throughput_results.empty:
throughput_results = throughput_results[ throughput_results = throughput_results[list(
list(throughput_results_column_mapping.keys()) throughput_results_column_mapping.keys())].rename(
].rename(columns=throughput_results_column_mapping) columns=throughput_results_column_mapping)
processed_results_json = results_to_json( processed_results_json = results_to_json(latency_results,
latency_results, throughput_results, serving_results throughput_results,
) serving_results)
for df in [latency_results, serving_results, throughput_results]: for df in [latency_results, serving_results, throughput_results]:
if df.empty: if df.empty:
@ -186,39 +184,38 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE" # we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply( df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
)
# get markdown tables # get markdown tables
latency_md_table = tabulate( latency_md_table = tabulate(latency_results,
latency_results, headers="keys", tablefmt="pipe", showindex=False headers='keys',
) tablefmt='pipe',
serving_md_table = tabulate( showindex=False)
serving_results, headers="keys", tablefmt="pipe", showindex=False serving_md_table = tabulate(serving_results,
) headers='keys',
throughput_md_table = tabulate( tablefmt='pipe',
throughput_results, headers="keys", tablefmt="pipe", showindex=False showindex=False)
) throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result # document the result
with open(results_folder / "benchmark_results.md", "w") as f: with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/" results = read_markdown("../.buildkite/nightly-benchmarks/" +
+ "performance-benchmarks-descriptions.md" "performance-benchmarks-descriptions.md")
)
results = results.format( results = results.format(
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table, throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table, serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json, benchmarking_results_in_json_string=processed_results_json)
)
f.write(results) f.write(results)
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f: with open(results_folder / "benchmark_results.json", "w") as f:
results = (
latency_results.to_dict(orient="records") results = latency_results.to_dict(
+ throughput_results.to_dict(orient="records") orient='records') + throughput_results.to_dict(
+ serving_results.to_dict(orient="records") orient='records') + serving_results.to_dict(orient='records')
)
f.write(json.dumps(results)) f.write(json.dumps(results))

View File

@ -14,12 +14,15 @@ def main(model, cachedir):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer" description="Download and save Hugging Face tokenizer")
) parser.add_argument("--model",
parser.add_argument("--model", type=str, required=True, help="Name of the model") type=str,
parser.add_argument( required=True,
"--cachedir", type=str, required=True, help="Directory to save the tokenizer" help="Name of the model")
) parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args() args = parser.parse_args()
main(args.model, args.cachedir) main(args.model, args.cachedir)

View File

@ -11,33 +11,33 @@ from tabulate import tabulate
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Parse command line arguments for summary-nightly-results script." description=
) 'Parse command line arguments for summary-nightly-results script.')
parser.add_argument( parser.add_argument('--results-folder',
"--results-folder", type=str,
type=str, required=True,
required=True, help='The folder where the results are stored.')
help="The folder where the results are stored.", parser.add_argument('--description',
) type=str,
parser.add_argument( required=True,
"--description", type=str, required=True, help="Description of the results." help='Description of the results.')
)
args = parser.parse_args() args = parser.parse_args()
return args return args
def get_perf(df, method, model, metric): def get_perf(df, method, model, metric):
means = [] means = []
for qps in [2, 4, 8, 16, "inf"]: for qps in [2, 4, 8, 16, "inf"]:
target = df["Test name"].str.contains(model) target = df['Test name'].str.contains(model)
target = target & df["Engine"].str.contains(method) target = target & df['Engine'].str.contains(method)
target = target & df["Test name"].str.contains("qps_" + str(qps)) target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target] filtered_df = df[target]
if filtered_df.empty: if filtered_df.empty:
means.append(0.0) means.append(0.)
else: else:
means.append(filtered_df[metric].values[0]) means.append(filtered_df[metric].values[0])
@ -45,6 +45,7 @@ def get_perf(df, method, model, metric):
def get_perf_w_std(df, method, model, metric): def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]: if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)") mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist() mean = mean.tolist()
@ -59,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
else: else:
assert metric == "Tput" assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)" df, method, model, "Output Tput (tok/s)")
)
mean = mean.tolist() mean = mean.tolist()
std = None std = None
@ -80,17 +80,18 @@ def main(args):
# generate markdown table # generate markdown table
df = pd.DataFrame.from_dict(results) df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description) as f: with open(args.description) as f:
description = f.read() description = f.read()
description = description.format(nightly_results_benchmarking_table=md_table) description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f: with open("nightly_results.md", "w") as f:
f.write(description) f.write(description)
if __name__ == "__main__": if __name__ == '__main__':
args = parse_arguments() args = parse_arguments()
main(args) main(args)

View File

@ -10,24 +10,15 @@ set -x
set -o pipefail set -o pipefail
check_gpus() { check_gpus() {
if command -v nvidia-smi; then # check the number of GPUs and GPU type.
# check the number of GPUs and GPU type. declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
echo "GPU found." echo "GPU found."
else else
echo "Need at least 1 GPU to run benchmarking." echo "Need at least 1 GPU to run benchmarking."
exit 1 exit 1
fi fi
if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
@ -99,15 +90,9 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB # wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1
sleep 1 done
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
# remove vllm config file # remove vllm config file
rm -rf ~/.config/vllm rm -rf ~/.config/vllm

View File

@ -34,8 +34,10 @@ serving_column_mapping = {
} }
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
@ -54,16 +56,17 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename( serving_results = serving_results[list(
columns=serving_column_mapping serving_column_mapping.keys())].rename(
) columns=serving_column_mapping)
serving_md_table_with_headers = tabulate( serving_md_table_with_headers = tabulate(serving_results,
serving_results, headers="keys", tablefmt="pipe", showindex=False headers='keys',
) tablefmt='pipe',
showindex=False)
# remove the first line of header # remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split("\n") serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -73,9 +76,10 @@ if __name__ == "__main__":
# document results with header. # document results with header.
# for those who wants to reproduce our benchmark. # for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers) f.write(serving_md_table_with_headers)
f.write("\n") f.write('\n')
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient="records")
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results)) f.write(json.dumps(results))

View File

@ -63,12 +63,10 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "", "disable_log_requests": "",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
"speculative_config": { "speculative_model": "turboderp/Qwama-0.5B-Instruct",
"model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4,
"num_speculative_tokens": 4, "speculative_draft_tensor_parallel_size": 1
"draft_tensor_parallel_size": 1
}
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -1,51 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true

View File

@ -1,23 +1,23 @@
steps: steps:
- label: "Build wheel - CUDA 12.8" - label: "Build wheel - CUDA 12.4"
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6" - label: "Build wheel - CUDA 12.1"
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -31,10 +31,10 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
@ -57,9 +57,7 @@ steps:
agents: agents:
queue: tpu_queue_postmerge queue: tpu_queue_postmerge
commands: commands:
- "yes | docker system prune -a" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins: plugins:
@ -84,22 +82,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build Neuron release image"
key: block-neuron-release-image-build
depends_on: ~
- label: "Build and publish Neuron release image"
depends_on: block-neuron-release-image-build
agents:
queue: neuron-postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -3,9 +3,6 @@
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -o pipefail set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
while true; do while true; do
@ -77,102 +74,50 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
echo "Commands:$commands" echo "Commands:$commands"
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
#ignore certain kernels tests #ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then if [[ $commands == *" kernels "* ]]; then
commands="${commands} \ commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \ --ignore=kernels/test_attention_selector.py \
--ignore=kernels/core/test_permute_cols.py" --ignore=kernels/test_blocksparse_attention.py \
fi --ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
if [[ $commands == *" kernels/attention"* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \
commands="${commands} \ --ignore=kernels/test_flash_attn.py \
--ignore=kernels/attention/stest_attention_selector.py \ --ignore=kernels/test_flashinfer.py \
--ignore=kernels/attention/test_blocksparse_attention.py \ --ignore=kernels/test_int8_quant.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \ --ignore=kernels/test_machete_gemm.py \
--ignore=kernels/attention/test_attention_selector.py \ --ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/attention/test_flash_attn.py \ --ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/attention/test_flashinfer.py \ --ignore=kernels/test_moe.py \
--ignore=kernels/attention/test_prefix_prefill.py \ --ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \ --ignore=kernels/test_rand.py \
--ignore=kernels/attention/test_mha_attn.py \ --ignore=kernels/test_sampler.py \
--ignore=kernels/attention/test_lightning_attn.py \ --ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_attention.py" --ignore=kernels/test_mamba_mixer2.py \
fi --ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
if [[ $commands == *" kernels/quantization"* ]]; then --ignore=kernels/test_mha_attn.py \
commands="${commands} \ --ignore=kernels/test_block_fp8.py \
--ignore=kernels/quantization/test_int8_quant.py \ --ignore=kernels/test_permute_cols.py"
--ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi fi
#ignore certain Entrypoints/openai tests #ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \ commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "} --ignore=entrypoints/openai/test_prompt_validation.py "}
fi fi
#ignore certain Entrypoints/llm tests #ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \ commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py # --ignore=entrypoints/openai/test_oot_registration.py
@ -181,8 +126,6 @@ fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used # assign job count as the number of shards used
@ -203,7 +146,6 @@ if [[ $commands == *"--shard-id="* ]]; then
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \ --name "${container_name}_${GPU}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands_gpu}" \ /bin/bash -c "${commands_gpu}" \
@ -234,7 +176,6 @@ else
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands}" /bin/bash -c "${commands}"

View File

@ -5,8 +5,8 @@
set -ex set -ex
set -o pipefail set -o pipefail
# cd 2 levels into the working directory # cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/../.." cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x . docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,6 +36,8 @@ function cpu_tests() {
# Run basic model test # Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model

View File

@ -9,7 +9,6 @@ python3 use_existing_torch.py
# Try building the docker image # Try building the docker image
DOCKER_BUILDKIT=1 docker build . \ DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \ --target vllm-openai \
--platform "linux/arm64" \ --platform "linux/arm64" \
-t gh200-test \ -t gh200-test \

View File

@ -5,7 +5,7 @@
set -ex set -ex
# Try building the docker image # Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu . docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup # Setup cleanup
# certain versions of HPU software stack have a bug that can # certain versions of HPU software stack have a bug that can

View File

@ -3,7 +3,7 @@
set -euox pipefail set -euox pipefail
if [[ $# -lt 4 ]]; then if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1 exit 1
fi fi

View File

@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp date "+%s" > /tmp/neuron-docker-build-timestamp
fi fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron . docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {

42
.buildkite/run-tpu-v1-test.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
&& echo TEST_7 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu . docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {

View File

@ -1,48 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman rm -f "$container_id" || true
fi
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -1,103 +0,0 @@
#!/bin/bash
set -xu
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_1: Running test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_5: Running examples/offline_inference/tpu.py; \
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
echo TEST_5_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_7: Running test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_9: Running test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_10: Running test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_12: Running test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
"
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,6 @@
# Documentation # Documentation
# label(str): the name of the test. emoji allowed. # label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only # fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands. # command(str): the single command to run for tests. incompatible with commands.
@ -32,7 +31,6 @@ steps:
##### fast check tests ##### ##### fast check tests #####
- label: Documentation Build # 2min - label: Documentation Build # 2min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/test_docs/docs" working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true fast_check: true
no_gpu: True no_gpu: True
@ -40,10 +38,9 @@ steps:
- pip install -r ../../requirements/docs.txt - pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html - SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports) # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html - grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/mq_llm_engine - tests/mq_llm_engine
@ -64,7 +61,6 @@ steps:
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
- label: Python-only Installation Test - label: Python-only Installation Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh - tests/standalone_tests/python_only_compile.sh
- setup.py - setup.py
@ -72,9 +68,8 @@ steps:
- bash standalone_tests/python_only_compile.sh - bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min - label: Basic Correctness Test # 30min
mirror_hardwares: [amdexperimental, amdproduction] #mirror_hardwares: [amd]
fast_check: true fast_check: true
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_basic_correctness
@ -89,7 +84,6 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test - label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_chunked_prefill - tests/basic_correctness/test_chunked_prefill
@ -98,7 +92,7 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/core - vllm/core
@ -108,10 +102,9 @@ steps:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test # 40min - label: Entrypoints Test # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
torch_nightly: true mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/llm - tests/entrypoints/llm
@ -125,12 +118,11 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -148,8 +140,6 @@ steps:
# test with tp=2 and external_dp=2 # test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py - python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@ -160,12 +150,11 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
- pushd ../examples/offline_inference - pushd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -173,13 +162,18 @@ steps:
- tests/tracing - tests/tracing
commands: commands:
- pytest -v -s metrics - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing - pytest -v -s tracing
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min - label: Regression Test # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
@ -189,7 +183,7 @@ steps:
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
@ -203,21 +197,20 @@ steps:
- pytest -v -s tokenization - pytest -v -s tokenization
- label: V1 Test - label: V1 Test
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s v1/core - pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine - pytest -v -s v1/engine
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s v1/structured_output - pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode - pytest -v -s v1/test_stats.py
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
@ -228,8 +221,8 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min - label: Examples Test # 25min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- examples/ - examples/
@ -253,7 +246,7 @@ steps:
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min - label: Prefix Caching Test # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/prefix_caching - tests/prefix_caching
@ -261,7 +254,6 @@ steps:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test # 36min - label: Samplers Test # 36min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/sampling_metadata.py - vllm/sampling_metadata.py
@ -272,7 +264,7 @@ steps:
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min - label: LogitsProcessor Test # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/model_executor/guided_decoding - vllm/model_executor/guided_decoding
@ -283,7 +275,6 @@ steps:
- pytest -v -s model_executor/test_guided_processors.py - pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min - label: Speculative decoding tests # 40min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/spec_decode - vllm/spec_decode
- tests/spec_decode - tests/spec_decode
@ -294,28 +285,14 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
@ -324,64 +301,27 @@ steps:
# these tests need to be separated, cannot combine # these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test - label: Kernels Test %N # 1h each
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- csrc/attention/
- vllm/attention - vllm/attention
- vllm/v1/attention - tests/kernels
- tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 4
- label: Kernels Quantization Test %N
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
commands:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader - vllm/model_executor/model_loader
@ -392,42 +332,30 @@ steps:
- pytest -v -s tensorizer_loader - pytest -v -s tensorizer_loader
- label: Benchmarks # 9min - label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/.buildkite" working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- benchmarks/ - benchmarks/
commands: commands:
- bash scripts/run-benchmarks.sh - bash run-benchmarks.sh
- label: Benchmarks CLI Test # 10min - label: Quantization Test # 33min
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Quantization Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
commands: command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness - label: OpenAI API correctness
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
@ -436,7 +364,6 @@ steps:
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min - label: Encoder Decoder tests # 5min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/encoder_decoder - tests/encoder_decoder
@ -444,111 +371,98 @@ steps:
- pytest -v -s encoder_decoder - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min - label: OpenAI-Compatible Tool Use # 20 min
mirror_hardwares: [amdexperimental]
fast_check: false fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
- tests/mistral_tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s tool_use
- pytest -v -s mistral_tool_use
##### models test ##### ##### models test #####
- label: Basic Models Test # 24min - label: Basic Models Test # 24min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models - tests/models
commands: commands:
- pytest -v -s models/test_transformers.py - pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531 # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) - label: Language Models Test (Standard) # 32min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language - tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/embedding/language -m core_model
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
- label: Language Models Test (Extended) - label: Language Models Test (Extended) # 1h10min
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language - tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) - label: Multi-Modal Models Test (Standard) # 40min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pytest -v -s models/multimodal
- pytest -v -s models/multimodal/processing - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1 # 48m
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 - label: Multi-Modal Models Test (Extended) 2 # 38m
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/decoder_only/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- label: Multi-Modal Models Test (Extended) 3
mirror_hardwares: [amdexperimental, amdproduction]
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Quantized Models Test
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization
# This test is used only in PR development phase to test individual models and should never run on main # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test - label: Custom Models Test
mirror_hardwares: [amdexperimental, amdproduction]
optional: true optional: true
commands: commands:
- echo 'Testing custom models...' - echo 'Testing custom models...'
@ -560,7 +474,6 @@ steps:
##### multi gpus test ##### ##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min - label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -571,7 +484,6 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
num_nodes: 2 num_nodes: 2
@ -590,7 +502,7 @@ steps:
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min - label: Distributed Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -608,33 +520,30 @@ steps:
- vllm/v1/engine/ - vllm/v1/engine/
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently. # this test fails consistently.
# TODO: investigate and fix # TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/plugins/ - vllm/plugins/
- tests/plugins/ - tests/plugins/
commands: commands:
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform # begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform - pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py - pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y - pip uninstall vllm_add_dummy_platform -y
@ -645,10 +554,8 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min - label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -669,7 +576,6 @@ steps:
- pytest -v -s multi_step/test_correctness_llm.py - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min - label: Pipeline Parallelism Test # 45min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -683,7 +589,6 @@ steps:
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA TP Test (Distributed) - label: LoRA TP Test (Distributed)
mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
@ -696,10 +601,11 @@ steps:
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -709,7 +615,6 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional - label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
gpu: a100 gpu: a100
@ -748,4 +653,4 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - bash ./run-tests.sh -c configs/models-large.txt -t 4

View File

@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html # if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels" echo "Skipping index files for cu121 wheels"
else else
# only upload index.html for cu128 wheels (default wheels) # only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi fi
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html # if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels" echo "Skipping index files for cu121 wheels"
else else
# only upload index.html for cu128 wheels (default wheels) # only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/" aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

1
.github/CODEOWNERS vendored
View File

@ -12,7 +12,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb /vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96 /vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth CMakeLists.txt @tlrmchlsmth
# vLLM V1 # vLLM V1

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -14,19 +14,19 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: | value: |
<details> <details>
<summary>The output of <code>python collect_env.py</code></summary> <summary>The output of `python collect_env.py`</summary>
```text ```text
Your output of `python collect_env.py` here Your output of `python collect_env.py` here
``` ```
</details> </details>
validations: validations:
required: true required: true
@ -75,7 +75,7 @@ body:
``` ```
``` ```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. The error message you got, with the full traceback.
``` ```
validations: validations:
required: true required: true

View File

@ -9,7 +9,7 @@ body:
value: > value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea - type: textarea
attributes: attributes:
label: The model to consider. label: The model to consider.

View File

@ -35,7 +35,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*) FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading --> <!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

47
.github/mergify.yml vendored
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/ - files~=\.buildkite/
- files~=^cmake/ - files~=^cmake/
- files=CMakeLists.txt - files=CMakeLists.txt
- files~=^docker/Dockerfile - files~=^Dockerfile
- files~=^requirements.*\.txt - files~=^requirements.*\.txt
- files=setup.py - files=setup.py
actions: actions:
@ -55,19 +55,11 @@ pull_request_rules:
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
- or: - or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/source/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/ - files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py - files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py - files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/ - files=benchmarks/benchmark_serving_guided.py
- files=tests/v1/entrypoints/llm/test_guided_generate.py - files=benchmarks/benchmark_guided.py
- files~=^vllm/v1/structured_output/
actions: actions:
label: label:
add: add:
@ -126,28 +118,6 @@ pull_request_rules:
remove: remove:
- tpu - tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/source/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label - name: ping author on conflicts and add 'needs-rebase' label
conditions: conditions:
- conflict - conflict
@ -163,17 +133,6 @@ pull_request_rules:
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: assign reviewer for tensorizer changes
conditions:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
- name: remove 'needs-rebase' label when conflict is resolved - name: remove 'needs-rebase' label when conflict is resolved
conditions: conditions:
- -conflict - -conflict

View File

@ -1,6 +1,4 @@
name: Add label on auto-merge enabled name: Add label on auto-merge enabled
permissions:
pull-requests: write
on: on:
pull_request_target: pull_request_target:
types: types:

View File

@ -2,9 +2,6 @@ name: Lint and Deploy Charts
on: pull_request on: pull_request
permissions:
contents: read
jobs: jobs:
lint-and-deploy: lint-and-deploy:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -53,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu - name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster - name: Configuration of docker images, network and namespace for the kind cluster
run: | run: |
@ -69,7 +66,7 @@ jobs:
export AWS_SECRET_ACCESS_KEY=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test - name: curl test
run: | run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@ -82,4 +79,4 @@ jobs:
"max_tokens": 7, "max_tokens": 7,
"temperature": 0 "temperature": 0
}'):$CODE" }'):$CODE"
echo "$CODE" echo "$CODE"

View File

@ -5,9 +5,6 @@ on:
push: push:
branches: [main] branches: [main]
permissions:
contents: read
jobs: jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest

View File

@ -1,6 +1,4 @@
name: PR Reminder Comment Bot name: PR Reminder Comment Bot
permissions:
pull-requests: write
on: on:
pull_request_target: pull_request_target:
types: [opened] types: [opened]

5
.gitignore vendored
View File

@ -3,6 +3,7 @@
# vllm-flash-attn built from source # vllm-flash-attn built from source
vllm/vllm_flash_attn/* vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
@ -80,7 +81,6 @@ instance/
# Sphinx documentation # Sphinx documentation
docs/_build/ docs/_build/
docs/source/getting_started/examples/ docs/source/getting_started/examples/
docs/source/api/vllm
# PyBuilder # PyBuilder
.pybuilder/ .pybuilder/
@ -203,6 +203,3 @@ benchmarks/**/*.json
# Linting # Linting
actionlint actionlint
shellcheck*/ shellcheck*/
# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages: default_stages:
- pre-commit # Run locally - pre-commit # Run locally
- manual # Run in CI - manual # Run in CI
@ -11,32 +8,31 @@ repos:
hooks: hooks:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.9.3
hooks: hooks:
- id: ruff - id: ruff
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks)/.*
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.4.1 rev: v2.4.0
hooks: hooks:
- id: codespell - id: codespell
additional_dependencies: ['tomli'] additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml'] args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 6.0.1 rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
hooks: hooks:
- id: isort - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3 rev: v19.1.7
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda] types_or: [c++, cuda]
args: [--style=file, --verbose] args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown - repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29 rev: v0.9.27
hooks: hooks:
- id: pymarkdown - id: pymarkdown
args: [fix] args: [fix]
@ -45,10 +41,10 @@ repos:
hooks: hooks:
- id: actionlint - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit - repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17 rev: 0.6.2
hooks: hooks:
- id: pip-compile - id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] args: [requirements/test.in, -o, requirements/test.txt]
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
@ -103,8 +99,8 @@ repos:
args: args:
- -c - -c
- | - |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)" printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi fi
language: system language: system
verbose: true verbose: true
@ -123,10 +119,6 @@ repos:
language: system language: system
always_run: true always_run: true
pass_filenames: false pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion

View File

@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -33,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# #
# Note: the CUDA torch version is derived from pyproject.toml and various # Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
@ -229,29 +230,25 @@ set(VLLM_EXT_SRC
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu" "csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/attention/vertical_slash_index.cu"
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu" "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu" "csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use") # Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -269,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above. # Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG ${CUTLASS_REVISION} GIT_TAG v3.8.0
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -285,14 +282,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp" "csrc/cutlass_extensions/common.cpp")
"csrc/attention/mla/cutlass_mla_entry.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}" SRCS "${VLLM_EXT_SRC}"
@ -301,55 +297,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
#
# For the Marlin kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
)
if (NOT marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin generation failed."
" Result: \"${marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin generate script hash" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@ -421,7 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@ -446,9 +396,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# #
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x. # kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
@ -499,9 +448,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
@ -514,32 +461,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(FP4_ARCHS) set(FP4_ARCHS)
endif() endif()
# CUTLASS MLA Archs and flags #
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
# Add MLA-specific include directories only to MLA source files
set_source_files_properties(${SRCS}
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
# clear MLA_ARCHS
set(MLA_ARCHS)
endif()
# CUTLASS MoE kernels # CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output. # to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu") "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@ -677,54 +605,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
)
if (NOT moe_marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin MOE generation failed."
" Result: \"${moe_marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
else()
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -732,17 +629,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_PERMUTE_SRC}"
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_gpu_extension_target(
_moe_C _moe_C
@ -751,8 +637,6 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC} SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS} COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES} ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
@ -762,7 +646,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
# #
set(VLLM_ROCM_EXT_SRC set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp" "csrc/rocm/torch_bindings.cpp"
"csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu") "csrc/rocm/attention.cu")
define_gpu_extension_target( define_gpu_extension_target(

View File

@ -5,11 +5,11 @@
# docs/source/contributing/dockerfile/dockerfile.md and # docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png # docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \ && apt-get install -y ccache software-properties-common git curl sudo \
&& for i in 1 2 3; do \ && add-apt-repository ppa:deadsnakes/ppa \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -37,7 +34,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels # as it was causing spam when compiling the CUTLASS kernels
@ -70,14 +66,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \ uv pip install --system -r requirements/cuda.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2 # explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size # Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real' ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -94,11 +89,9 @@ COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
@ -165,25 +158,19 @@ FROM base as dev
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
COPY requirements/lint.txt requirements/lint.txt COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt \ uv pip install --system -r requirements/dev.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed # TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
@ -198,10 +185,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& for i in 1 2 3; do \ && add-apt-repository ppa:deadsnakes/ppa \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -216,7 +200,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -237,8 +220,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1 # $ export FLASHINFER_ENABLE_AOT=1
@ -255,18 +237,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
# uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
# TESTING: install FlashInfer from source to test 2.7.0 final RC
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' \
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@e00e8cedbfcb220f328fd36aa8f529f869b01e6b" ; \
fi fi
COPY examples examples COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list
# Although we build Flashinfer with AOT mode, there's still # Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to # some issues w.r.t. JIT compilation. Therefore we need to
@ -274,8 +247,7 @@ uv pip list
# TODO: Remove this once FlashInfer AOT wheel is fixed # TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
@ -289,11 +261,6 @@ ADD . /vllm-workspace/
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
@ -322,7 +289,6 @@ RUN mv vllm test_docs/
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images # base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694

69
Dockerfile.cpu Normal file
View File

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
# Stage "cpu-test-1": OS packages, compiler toolchain, and Python build requirements.
FROM ubuntu:22.04 AS cpu-test-1
# Route C++ compiles launched by CMake through ccache, keeping its cache under /root/.cache/ccache.
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
# Install system tools and libraries, then register gcc-12 as the `gcc` alternative
# with g++-12 attached as its slave so both switch together.
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
# Preload tcmalloc and the Intel OpenMP runtime into every process started in the image.
# NOTE(review): /usr/local/lib/libiomp5.so is presumably where the intel-openmp wheel places the library -- confirm.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
# Set the core-dump size limit to 0 for shells that source ~/.bashrc.
RUN echo 'ulimit -c 0' >> ~/.bashrc
# Intel Extension for PyTorch, pinned to 2.6.0.
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
# Extra pip index, defaulting to the PyTorch CPU wheel index; overridable via --build-arg.
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# Upgrade pip, then install the build requirements; the requirements file is
# bind-mounted from the build context rather than copied into a layer.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
# Stage "build": install CPU runtime requirements, build the vLLM wheel from the
# copied source tree, install it, and set the API server as the entrypoint.
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
# Install the CPU requirements. common.txt is bind-mounted alongside cpu.txt
# (presumably because cpu.txt includes it -- confirm against requirements/cpu.txt).
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
# Copy the build context into /workspace/vllm.
COPY . .
# Optional repository check: runs tools/check_repo.sh only when GIT_REPO_CHECK is
# set to something other than 0 (default 0 skips it). .git is bind-mounted for this step.
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
# Build the wheel with VLLM_TARGET_DEVICE=cpu, install it, then delete dist/.
# pip and ccache caches are mounted as build caches; .git is bind-mounted for the build.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
# Symlink tests, examples and benchmarks from the source tree into /workspace.
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
# Containers from this image start the OpenAI-compatible API server module by default.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm

View File

@ -1,6 +1,6 @@
# default base image # default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict # uninstall transformers-neuronx package explicitly to avoid version conflict

View File

@ -21,8 +21,12 @@ ENV UV_LINK_MODE=copy
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
# when `--jobs=<N>` is passed with podman build command # when `--jobs=<N>` is passed with podman build command
RUN microdnf install -y openssl-devel dnf \ RUN microdnf install -y openssl-devel dnf \
&& dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \ && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
&& dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \ https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
&& dnf config-manager --set-enabled crb \
&& dnf install -y \ && dnf install -y \
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \ git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
@ -34,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv \ && python -m pip install -U pip uv \
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \ && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& cd /tmp && touch control && cd /tmp && touch control
@ -122,16 +126,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM base-builder AS cv-builder FROM base-builder AS cv-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG OPENCV_VERSION=86 ARG OPENCV_VERSION=84
# patch for version 4.11.0.86
ARG OPENCV_PATCH=97f3f39
ARG ENABLE_HEADLESS=1 ARG ENABLE_HEADLESS=1
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
cd opencv-python && \ cd opencv-python && \
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
python -m build --wheel --installer=uv --outdir /opencvwheels/ python -m build --wheel --installer=uv --outdir /opencvwheels/
############################################################### ###############################################################
@ -147,15 +148,9 @@ COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu ARG VLLM_TARGET_DEVICE=cpu
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
# this step installs vllm and populates uv cache # this step installs vllm and populates uv cache
# with all the transitive dependencies # with all the transitive dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
uv pip install maturin && \
uv build --wheel --out-dir /hf_wheels/
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
@ -164,7 +159,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ uv pip install pandas pythran pybind11 && \
# sentencepiece.pc is in some pkgconfig inside uv cache # sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
@ -243,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& python -m pip install -U pip uv --no-cache \ && python -m pip install -U pip uv --no-cache \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& make -C /numactl install \ && make -C /numactl install \
&& uv pip install 'cmake<4' \ && uv pip install cmake \
&& cmake --install /lapack/build \ && cmake --install /lapack/build \
&& uv pip uninstall cmake && uv pip uninstall cmake
@ -252,9 +247,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm

View File

@ -114,16 +114,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# User-friendly environment setting for multi-processing to avoid below RuntimeError.
# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
# you must use the 'spawn' start method
# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Performance environment variable. # Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1 ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION= ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d" ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006" ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4" ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="5a77249" ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
@ -31,11 +31,8 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && apt-get install -y software-properties-common git curl sudo vim less \
&& for i in 1 2 3; do \ && add-apt-repository ppa:deadsnakes/ppa \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \ python${PYTHON_VERSION}-lib2to3 python-is-python3 \
@ -45,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
@ -63,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \ RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \ && git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \ && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \ && cd build/release \
&& make package && make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -114,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \ RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \ && git checkout ${FA_BRANCH} \
&& git submodule update --init \ && git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \ dpkg -i /install/*deb \
@ -147,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH ARG RCCL_BRANCH
ARG RCCL_REPO ARG RCCL_REPO
@ -164,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -179,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
RUN microdnf install -y \ RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \ openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
microdnf clean all microdnf clean all
# Python Installation # Python Installation
@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \ cd ../../python && \
export PYARROW_PARALLEL=4 && \ export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \ export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements-build.txt && \ uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build FROM python-install AS numa-build
@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder
# Install hf-xet
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/huggingface/xet-core.git && \
cd xet-core/hf_xet/ && \
uv pip install maturin patchelf && \
python -m maturin build --release --out dist && \
mkdir -p /tmp/hf-xet/dist && \
cp dist/*.whl /tmp/hf-xet/dist/
# Final build stage # Final build stage
FROM python-install AS vllm-cpu FROM python-install AS vllm-cpu
ARG PYTHON_VERSION ARG PYTHON_VERSION
@ -123,7 +107,6 @@ ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
COPY . /workspace/vllm COPY . /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
@ -137,15 +120,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \ sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
uv pip install -v \ uv pip install -v \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
-r requirements/build.txt \ -r requirements/build.txt \
@ -169,5 +149,4 @@ USER 2000
WORKDIR /home/vllm WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 -m pip install \ python3 -m pip install \
-r requirements/tpu.txt -r requirements/tpu.txt
RUN python3 -m pip install -e . RUN python3 setup.py develop
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils

View File

@ -40,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 setup.py install python3 setup.py install
# Per the XPU docs, intel-extension-for-pytorch 2.6.10+xpu must be installed manually because its dependencies conflict with torch 2.6.0+xpu.
# FIXME: This will be fixed in ipex 2.7; it is left here for awareness.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
CMD ["/bin/bash"] CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai FROM vllm-base AS vllm-openai

View File

@ -10,26 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
</h3> </h3>
<p align="center"> <p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p> </p>
--- ---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥 *Latest News* 🔥
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
<details> <details>
<summary>Previous News</summary> <summary>Previous News</summary>
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@ -74,7 +77,7 @@ vLLM is flexible and easy to use with:
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-lora support
vLLM seamlessly supports most popular open-source models on HuggingFace, including: vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama) - Transformer-like LLMs (e.g., Llama)
@ -100,7 +103,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors ## Sponsors
@ -123,7 +126,6 @@ Compute Resources:
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI

View File

@ -41,39 +41,29 @@ become available.
<td><code>synthetic</code></td> <td><code>synthetic</code></td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-VisionArena</strong></td> <td><strong>HuggingFace</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td><code>lmarena-ai/VisionArena-Chat</code></td> <td>Specify your dataset path on HuggingFace</td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-InstructCoder</strong></td> <td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td> <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
✅: supported ✅: supported
🟡: Partial support
🚧: to be supported 🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArena's `dataset-name` should be set to `hf`
--- ---
## Example - Online Benchmark ## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
First start serving your model First start serving your model
```bash ```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
``` ```
Then run the benchmarking script Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
--backend vllm \ NUM_PROMPTS=10
--model NousResearch/Hermes-3-Llama-3.1-8B \ BACKEND="vllm"
--endpoint /v1/completions \ DATASET_NAME="sharegpt"
--dataset-name sharegpt \ DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
--num-prompts 10
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 1000 --num-prompts "${NUM_PROMPTS}"
``` ```
### InstructCoder Benchmark with Speculative Decoding ### HuggingFaceDataset Examples
``` bash Currently, HuggingFaceDataset only supports dataset formats
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
--speculative-model "[ngram]" \ formats, please consider contributing.
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
```bash ```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmms-lab/LLaVA-OneVision-Data \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--hf-subset "chart2text(cauldron)" \ --num-prompts "${NUM_PROMPTS}" \
--num-prompts 10 --hf-subset "${DATASET_SUBSET}"
``` ```
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}" \
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
``` ```
--- ---
## Example - Offline Throughput Benchmark ## Example - Offline Throughput Benchmark
```bash ```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model "${MODEL_NAME}" \
--dataset-name sonnet \ --dataset-name "${DATASET_NAME}" \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path "${DATASET_PATH}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}"
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
``` bash ``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--backend vllm-chat \ --backend "vllm-chat" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--num-prompts 1000 \ --num-prompts "${NUM_PROMPTS}" \
--hf-split train --hf-split "${DATASET_SPLIT}"
``` ```
The `num prompt tokens` now includes image token counts The `num prompt tokens` now includes image token counts
@ -261,83 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \ --model "${MODEL_NAME}" \
--backend vllm \ --backend "${BACKEND}" \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path "${DATASET_PATH}" \
--dataset_name sharegpt \ --dataset_name "${DATASET_NAME}" \
--num-prompts 10 \ --num-prompts "${NUM_PROMPTS}" \
--max-loras 2 \ --max-loras "${MAX_LORAS}" \
--max-lora-rank 8 \ --max-lora-rank "${MAX_LORA_RANK}" \
--enable-lora \ ${ENABLE_LORA} \
--lora-path yard1/llama-2-7b-sql-lora-test --lora-path "${LORA_PATH}"
``` ```

View File

@ -1,212 +0,0 @@
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
# It also supports additional requirement: e2e latency and prefix cache.
# Pre-requisite:
# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# 4. Run the script. It might take a long time, so consider running it inside tmux so that it keeps running if your connection drops.
# 5. The final result will be saved in RESULT file.
# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
# Timestamp tag that gives every run its own log folder.
TAG=$(date +"%Y_%m_%d_%H_%M")

# User settings; every one of them must be filled in before running.
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
# Must be spelled MIN_CACHE_HIT_PCT: that is the name read when the sonnet
# prefix length is computed. Any other spelling leaves it unset (i.e. 0).
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

# Start from an empty log folder. Quoted so a path containing spaces is
# handled as a single argument.
rm -rf "$LOG_FOLDER"
mkdir -p "$LOG_FOLDER"

# Everything below uses paths relative to the repo; stop if it is missing
# instead of writing files into whatever the current directory happens to be.
cd "$BASE/vllm" || exit 1

# Build sonnet_4x.txt (four concatenated copies of sonnet.txt) so there is a
# larger text to sample benchmark inputs from.
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

pip install datasets

# Record which commit the results were produced with.
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

# Best result seen so far; updated by run_benchmark.
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
# Run benchmark_serving.py once, at request rate $1, against the server on
# port 8004 and parse the metrics out of its log.
# Relies on bash dynamic scoping: reads max_num_seqs, max_num_batched_tokens
# and prefix_len from the calling run_benchmark, and sets its bm_log,
# through_put, e2el and goodput.
_run_serving_benchmark() {
    local rate=$1
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${rate}.txt"
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "$MODEL" \
        --dataset-name sonnet \
        --dataset-path benchmarks/sonnet_4x.txt \
        --sonnet-input-len "$INPUT_LEN" \
        --sonnet-output-len "$OUTPUT_LEN" \
        --ignore-eos \
        --disable-tqdm \
        --request-rate "$rate" \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput "e2el:$MAX_LATENCY_ALLOWED_MS" \
        --num-prompts 100 \
        --sonnet-prefix-len "$prefix_len" \
        --port 8004 > "$bm_log"
    # Keep only digits and dots from the matching line, e.g. "12.34".
    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
}

# Succeed when a P99 e2e latency was parsed and is within the allowed limit.
# An empty value (benchmark produced no metrics) counts as "not met" rather
# than being passed to bc as a malformed expression.
_meets_latency_requirement() {
    [[ -n "$e2el" ]] && (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) ))
}

# Stop the server started by run_benchmark and give it time to shut down.
_stop_server() {
    echo "pkill vllm"
    echo
    pkill vllm
    sleep 10
}

# Benchmark one server configuration.
#   $1: value for --max-num-seqs
#   $2: value for --max-num-batched-tokens
# Starts `vllm serve` on port 8004, measures throughput at request rate inf
# and, if the P99 e2e latency exceeds MAX_LATENCY_ALLOWED_MS, lowers the
# request rate one step at a time until the requirement is met or the rate
# reaches 0. Appends the outcome to $RESULT and updates the best_* globals.
# Returns 1 if the server does not start within 10 minutes, 0 otherwise.
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f "$vllm_log"

    # start the server
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve "$MODEL" \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization 0.98 \
        --max-num-seqs "$max_num_seqs" \
        --max-num-batched-tokens "$max_num_batched_tokens" \
        --tensor-parallel-size 1 \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
    echo "wait for 10 minutes.."
    echo

    # Poll the server log every 10 seconds, 60 times (10 minutes in total).
    local server_started=0
    for _ in {1..60}; do
        if grep -Fq "Application startup complete" "$vllm_log"; then
            echo "Application started"
            server_started=1
            break
        fi
        sleep 10
    done
    if (( ! server_started )); then
        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
        _stop_server
        return 1
    fi

    echo "run benchmark test..."
    echo
    local bm_log through_put e2el goodput
    local meet_latency_requirement=0
    local prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))

    # get a basic qps by using request-rate inf
    # request_rate is set before the run so the result line reports "inf"
    # instead of an empty or stale value when this first run already passes.
    local request_rate=inf
    _run_serving_benchmark "$request_rate"
    if _meets_latency_requirement; then
        meet_latency_requirement=1
    fi

    if (( ! meet_latency_requirement )); then
        # start from request-rate as int(through_put) + 1
        local whole_qps=${through_put%.*}
        request_rate=$(( ${whole_qps:-0} + 1 ))
        while (( request_rate > 0 )); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            _run_serving_benchmark "$request_rate"
            if _meets_latency_requirement; then
                meet_latency_requirement=1
                break
            fi
            request_rate=$(( request_rate - 1 ))
        done
    fi

    # write the results and update the best result.
    if (( meet_latency_requirement )); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" | tee -a "$RESULT"
        if [[ -n "$through_put" ]] && (( $(echo "$through_put > $best_throughput" | bc -l) )); then
            best_throughput=$through_put
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" | tee -a "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    _stop_server
    rm -f "$vllm_log"
    printf '=%.0s' $(seq 1 20)
    return 0
}
# Sweep every (max_num_seqs, max_num_batched_tokens) combination.
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
    for num_batched_tokens in $num_batched_tokens_list; do
        # No early exit here: every combination has to run for the summary
        # below to be meaningful. run_benchmark returns non-zero when the
        # server fails to start; report it and move on to the next one.
        run_benchmark "$num_seqs" "$num_batched_tokens" ||
            echo "skipping max_num_seqs: $num_seqs, max_num_batched_tokens: $num_batched_tokens (run failed)"
    done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" | tee -a "$RESULT"

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import io
import json import json
import os import os
import sys import sys
@ -12,7 +11,8 @@ from typing import Optional, Union
import aiohttp import aiohttp
import huggingface_hub.constants import huggingface_hub.constants
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
# NOTE(simon): do not import vLLM here so the benchmark script # NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed. # can run without vLLM installed.
@ -32,7 +32,6 @@ class RequestFuncInput:
extra_body: Optional[dict] = None extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None multi_modal_content: Optional[dict] = None
ignore_eos: bool = False ignore_eos: bool = False
language: Optional[str] = None
@dataclass @dataclass
@ -42,7 +41,8 @@ class RequestFuncOutput:
latency: float = 0.0 latency: float = 0.0
output_tokens: int = 0 output_tokens: int = 0
ttft: float = 0.0 # Time to first token ttft: float = 0.0 # Time to first token
itl: list[float] = field(default_factory=list) # list of inter-token latencies itl: list[float] = field(
default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0 prompt_len: int = 0
error: str = "" error: str = ""
@ -55,9 +55,8 @@ async def async_request_tgi(
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession( async with aiohttp.ClientSession(trust_env=True,
trust_env=True, timeout=AIOHTTP_TIMEOUT timeout=AIOHTTP_TIMEOUT) as session:
) as session:
params = { params = {
"max_new_tokens": request_func_input.output_len, "max_new_tokens": request_func_input.output_len,
"do_sample": True, "do_sample": True,
@ -104,7 +103,8 @@ async def async_request_tgi(
# Decoding phase # Decoding phase
else: else:
output.itl.append(timestamp - most_recent_timestamp) output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
@ -131,9 +131,8 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession( async with aiohttp.ClientSession(trust_env=True,
trust_env=True, timeout=AIOHTTP_TIMEOUT timeout=AIOHTTP_TIMEOUT) as session:
) as session:
payload = { payload = {
"accumulate_tokens": True, "accumulate_tokens": True,
"text_input": request_func_input.prompt, "text_input": request_func_input.prompt,
@ -158,7 +157,8 @@ async def async_request_trt_llm(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data:") chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk) data = json.loads(chunk)
output.generated_text += data["text_output"] output.generated_text += data["text_output"]
@ -170,7 +170,8 @@ async def async_request_trt_llm(
# Decoding phase # Decoding phase
else: else:
output.itl.append(timestamp - most_recent_timestamp) output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
@ -194,11 +195,10 @@ async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
async with aiohttp.ClientSession( async with aiohttp.ClientSession(trust_env=True,
trust_env=True, timeout=AIOHTTP_TIMEOUT timeout=AIOHTTP_TIMEOUT) as session:
) as session:
payload = { payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt, "prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len, "max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
@ -214,22 +214,12 @@ async def async_request_deepspeed_mii(
st = time.perf_counter() st = time.perf_counter()
try: try:
async with session.post( async with session.post(url=request_func_input.api_url,
url=request_func_input.api_url, json=payload json=payload) as response:
) as response:
if response.status == 200: if response.status == 200:
parsed_resp = await response.json() parsed_resp = await response.json()
output.latency = time.perf_counter() - st output.latency = time.perf_counter() - st
if "choices" in parsed_resp: output.generated_text = parsed_resp["text"][0]
output.generated_text = parsed_resp["choices"][0]["text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = (
"Unexpected response format: "
"neither 'choices' nor 'text' found"
)
output.success = False
output.success = True output.success = True
else: else:
output.error = response.reason or "" output.error = response.reason or ""
@ -249,20 +239,17 @@ async def async_request_openai_completions(
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(
"OpenAI Completions API URL must end with 'completions' or 'profile'." ("completions", "profile")
) ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession( async with aiohttp.ClientSession(trust_env=True,
trust_env=True, timeout=AIOHTTP_TIMEOUT timeout=AIOHTTP_TIMEOUT) as session:
) as session:
payload = { payload = {
"model": request_func_input.model_name "model": request_func_input.model_name \
if request_func_input.model_name if request_func_input.model_name else request_func_input.model,
else request_func_input.model,
"prompt": request_func_input.prompt, "prompt": request_func_input.prompt,
"temperature": 0.0, "temperature": 0.0,
"repetition_penalty": 1.0,
"max_tokens": request_func_input.output_len, "max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs, "logprobs": request_func_input.logprobs,
"stream": True, "stream": True,
@ -274,7 +261,9 @@ async def async_request_openai_completions(
payload["ignore_eos"] = request_func_input.ignore_eos payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body: if request_func_input.extra_body:
payload.update(request_func_input.extra_body) payload.update(request_func_input.extra_body)
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput() output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len output.prompt_len = request_func_input.prompt_len
@ -283,9 +272,8 @@ async def async_request_openai_completions(
st = time.perf_counter() st = time.perf_counter()
most_recent_timestamp = st most_recent_timestamp = st
try: try:
async with session.post( async with session.post(url=api_url, json=payload,
url=api_url, json=payload, headers=headers headers=headers) as response:
) as response:
if response.status == 200: if response.status == 200:
first_chunk_received = False first_chunk_received = False
async for chunk_bytes in response.content: async for chunk_bytes in response.content:
@ -293,7 +281,8 @@ async def async_request_openai_completions(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]": if chunk != "[DONE]":
data = json.loads(chunk) data = json.loads(chunk)
@ -313,20 +302,21 @@ async def async_request_openai_completions(
# Decoding phase # Decoding phase
else: else:
output.itl.append(timestamp - most_recent_timestamp) output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
generated_text += text or "" generated_text += text or ""
elif usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received: if first_chunk_received:
output.success = True output.success = True
else: else:
output.success = False output.success = False
output.error = ( output.error = (
"Never received a valid chunk to calculate TTFT." "Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!" "This response will be marked as failed!")
)
output.generated_text = generated_text output.generated_text = generated_text
output.latency = most_recent_timestamp - st output.latency = most_recent_timestamp - st
else: else:
@ -347,22 +337,23 @@ async def async_request_openai_chat_completions(
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), ( assert api_url.endswith(
"OpenAI Chat Completions API URL must end with 'chat/completions'." ("chat/completions", "profile")
) ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession( async with aiohttp.ClientSession(trust_env=True,
trust_env=True, timeout=AIOHTTP_TIMEOUT timeout=AIOHTTP_TIMEOUT) as session:
) as session:
content = [{"type": "text", "text": request_func_input.prompt}] content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content: if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content) content.append(request_func_input.multi_modal_content)
payload = { payload = {
"model": request_func_input.model_name "model": request_func_input.model_name \
if request_func_input.model_name if request_func_input.model_name else request_func_input.model,
else request_func_input.model,
"messages": [ "messages": [
{"role": "user", "content": content}, {
"role": "user",
"content": content
},
], ],
"temperature": 0.0, "temperature": 0.0,
"max_completion_tokens": request_func_input.output_len, "max_completion_tokens": request_func_input.output_len,
@ -388,16 +379,16 @@ async def async_request_openai_chat_completions(
st = time.perf_counter() st = time.perf_counter()
most_recent_timestamp = st most_recent_timestamp = st
try: try:
async with session.post( async with session.post(url=api_url, json=payload,
url=api_url, json=payload, headers=headers headers=headers) as response:
) as response:
if response.status == 200: if response.status == 200:
async for chunk_bytes in response.content: async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip() chunk_bytes = chunk_bytes.strip()
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]": if chunk != "[DONE]":
timestamp = time.perf_counter() timestamp = time.perf_counter()
data = json.loads(chunk) data = json.loads(chunk)
@ -411,11 +402,13 @@ async def async_request_openai_chat_completions(
# Decoding phase # Decoding phase
else: else:
output.itl.append(timestamp - most_recent_timestamp) output.itl.append(timestamp -
most_recent_timestamp)
generated_text += content or "" generated_text += content or ""
elif usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
@ -435,115 +428,8 @@ async def async_request_openai_chat_completions(
return output return output
async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming audio request and record its timing metrics.

    The audio in ``request_func_input.multi_modal_content["audio"]`` is
    written to an in-memory WAV file and posted as multipart form data
    together with the request parameters.

    Args:
        request_func_input: Request description. ``api_url`` must end with
            ``transcriptions`` or ``translations``.
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        A ``RequestFuncOutput`` holding the generated text, TTFT, inter-token
        latencies and total latency. On a non-200 response or an exception,
        ``success`` is False and ``error`` holds the reason or traceback.
    """
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile

    api_url = request_func_input.api_url
    # Both halves of the message belong inside the assert; a string placed
    # after the closing parenthesis is a separate no-op statement.
    assert api_url.endswith(("transcriptions", "translations")), (
        "OpenAI Chat Completions API URL must end with 'transcriptions' "
        "or `translations`."
    )

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name
            else request_func_input.model,
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "language": "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
            "stream_continuous_usage_stats": True,
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        # Send audio file
        def to_bytes(y, sr):
            # presumably y is the sample array and sr the sample rate, as
            # expected by soundfile.write - confirm against the dataset code.
            buffer = io.BytesIO()
            soundfile.write(buffer, y, sr, format="WAV")
            buffer.seek(0)
            return buffer

        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
            form = aiohttp.FormData()
            form.add_field("file", f, content_type="audio/wav")
            # Form fields must be strings, hence str(value).
            for key, value in payload.items():
                form.add_field(key, str(value))

            output = RequestFuncOutput()
            output.prompt_len = request_func_input.prompt_len

            generated_text = ""
            ttft = 0.0
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
                async with session.post(
                    url=api_url, data=form, headers=headers
                ) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue

                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()
                                data = json.loads(chunk)

                                if choices := data.get("choices"):
                                    content = choices[0]["delta"].get("content")
                                    # First token
                                    if ttft == 0.0:
                                        ttft = timestamp - st
                                        output.ttft = ttft

                                    # Decoding phase
                                    else:
                                        output.itl.append(
                                            timestamp - most_recent_timestamp
                                        )

                                    generated_text += content or ""
                                elif usage := data.get("usage"):
                                    output.output_tokens = usage.get(
                                        "completion_tokens"
                                    )

                                most_recent_timestamp = timestamp

                        output.generated_text = generated_text
                        output.success = True
                        output.latency = most_recent_timestamp - st
                    else:
                        output.error = response.reason or ""
                        output.success = False
            except Exception:
                # Benchmark boundary: record the failure in the output
                # instead of aborting the whole run.
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output
def get_model(pretrained_model_name_or_path: str) -> str: def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download from modelscope import snapshot_download
from vllm.model_executor.model_loader.weight_utils import get_lock from vllm.model_executor.model_loader.weight_utils import get_lock
@ -554,8 +440,7 @@ def get_model(pretrained_model_name_or_path: str) -> str:
model_path = snapshot_download( model_path = snapshot_download(
model_id=pretrained_model_name_or_path, model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"], ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
)
return model_path return model_path
return pretrained_model_name_or_path return pretrained_model_name_or_path
@ -568,23 +453,23 @@ def get_tokenizer(
**kwargs, **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists( if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path pretrained_model_name_or_path):
): pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path) pretrained_model_name_or_path)
if tokenizer_mode == "slow": if tokenizer_mode == "slow":
if kwargs.get("use_fast", False): if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") raise ValueError(
"Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False kwargs["use_fast"] = False
if tokenizer_mode == "mistral": if tokenizer_mode == "mistral":
try: try:
from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.transformers_utils.tokenizer import MistralTokenizer
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError("MistralTokenizer requires vllm package.\n"
"MistralTokenizer requires vllm package.\n" "Please install it with `pip install vllm` "
"Please install it with `pip install vllm` " "to use mistral tokenizer mode.") from e
"to use mistral tokenizer mode." return MistralTokenizer.from_pretrained(
) from e str(pretrained_model_name_or_path))
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
else: else:
return AutoTokenizer.from_pretrained( return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, pretrained_model_name_or_path,
@ -600,14 +485,7 @@ ASYNC_REQUEST_FUNCS = {
"deepspeed-mii": async_request_deepspeed_mii, "deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions, "openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions, "openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm, "tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions, "scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions, "sglang": async_request_openai_completions,
} }
OPENAI_COMPATIBLE_BACKENDS = [
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions, async_request_openai_chat_completions)
]

View File

@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from functools import cache from functools import cache
from io import BytesIO from typing import Any, Optional, Union
from typing import Any, Callable, Optional, Union
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -64,7 +63,6 @@ class SampleRequest:
class BenchmarkDataset(ABC): class BenchmarkDataset(ABC):
DEFAULT_SEED = 0 DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__( def __init__(
self, self,
@ -82,12 +80,14 @@ class BenchmarkDataset(ABC):
self.dataset_path = dataset_path self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the # Set the random seed, ensuring that a None value is replaced with the
# default seed. # default seed.
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None self.data = None
def apply_multimodal_chat_transformation( def apply_multimodal_chat_transformation(
self, prompt: str, mm_content: Optional[MultiModalDataDict] = None self,
) -> list[dict]: prompt: str,
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
""" """
Transform a prompt and optional multimodal content into a chat format. Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific conversation This method is used for chat models that expect a specific conversation
@ -109,7 +109,8 @@ class BenchmarkDataset(ABC):
NotImplementedError: If a subclass does not implement this method. NotImplementedError: If a subclass does not implement this method.
""" """
# TODO (jenniferzhao): add support for downloading data # TODO (jenniferzhao): add support for downloading data
raise NotImplementedError("load_data must be implemented in subclasses.") raise NotImplementedError(
"load_data must be implemented in subclasses.")
def get_random_lora_request( def get_random_lora_request(
self, self,
@ -155,9 +156,8 @@ class BenchmarkDataset(ABC):
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
@abstractmethod @abstractmethod
def sample( def sample(self, tokenizer: PreTrainedTokenizerBase,
self, tokenizer: PreTrainedTokenizerBase, num_requests: int num_requests: int) -> list[SampleRequest]:
) -> list[SampleRequest]:
""" """
Abstract method to generate sample requests from the dataset. Abstract method to generate sample requests from the dataset.
@ -175,9 +175,8 @@ class BenchmarkDataset(ABC):
""" """
raise NotImplementedError("sample must be implemented in subclasses.") raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests( def maybe_oversample_requests(self, requests: list[SampleRequest],
self, requests: list[SampleRequest], num_requests: int num_requests: int) -> None:
) -> None:
""" """
Oversamples the list of requests if its size is less than the desired Oversamples the list of requests if its size is less than the desired
number. number.
@ -188,9 +187,11 @@ class BenchmarkDataset(ABC):
""" """
if len(requests) < num_requests: if len(requests) < num_requests:
random.seed(self.random_seed) random.seed(self.random_seed)
additional = random.choices(requests, k=num_requests - len(requests)) additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional) requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.", num_requests) logger.info("Oversampled requests to reach %d total samples.",
num_requests)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -215,14 +216,14 @@ def is_valid_sequence(
""" """
# Check for invalid conditions # Check for invalid conditions
prompt_too_short = prompt_len < min_len prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len < min_len) output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met # Return True if none of the invalid conditions are met
return not ( return not (prompt_too_short or output_too_short or prompt_too_long
prompt_too_short or output_too_short or prompt_too_long or combined_too_long or combined_too_long)
)
@cache @cache
@ -238,44 +239,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
""" """
Process a single image input and return a multimedia content dictionary. Process a single image input and return a multimedia content dictionary.
Supports three input types: For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key For a string input:
containing raw image data. - Loads the bytes as a PIL.Image.Image. - Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as "file://".
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - Returns a dictionary with the image URL.
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises: Raises:
ValueError: If the input is not a supported type. ValueError: If the input is neither a PIL.Image.Image nor a string.
""" """
if isinstance(image, dict) and "bytes" in image:
image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = image.convert("RGB") image = image.convert("RGB")
with io.BytesIO() as image_data: with io.BytesIO() as image_data:
image.save(image_data, format="JPEG") image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return { return {
"type": "image_url", "type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, "image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
} }
if isinstance(image, str): if isinstance(image, str):
image_url = ( image_url = (image if image.startswith(
image if image.startswith(("http://", "file://")) else f"file://{image}" ("http://", "file://")) else f"file://{image}")
)
return {"type": "image_url", "image_url": {"url": image_url}} return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError( raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image" f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
" or str or dictionary with raw image bytes."
)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -286,7 +284,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
class RandomDataset(BenchmarkDataset): class RandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the random dataset. # Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN = 0 DEFAULT_PREFIX_LEN = 0
DEFAULT_RANGE_RATIO = 0.0 DEFAULT_RANGE_RATIO = 1.0
DEFAULT_INPUT_LEN = 1024 DEFAULT_INPUT_LEN = 1024
DEFAULT_OUTPUT_LEN = 128 DEFAULT_OUTPUT_LEN = 128
@ -306,62 +304,35 @@ class RandomDataset(BenchmarkDataset):
output_len: int = DEFAULT_OUTPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs, **kwargs,
) -> list[SampleRequest]: ) -> list[SampleRequest]:
# Enforce range_ratio < 1
assert range_ratio < 1.0, (
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
)
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
num_special_tokens = tokenizer.num_special_tokens_to_add()
real_input_len = input_len - num_special_tokens
prefix_token_ids = ( prefix_token_ids = (np.random.randint(
np.random.randint(0, vocab_size, size=prefix_len).tolist() 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
if prefix_len > 0
else []
)
# New sampling logic: [X * (1 - b), X * (1 + b)] input_low = int(input_len * range_ratio)
input_low = int(real_input_len * (1 - range_ratio)) output_low = int(output_len * range_ratio)
input_high = int(real_input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio))
output_high = int(output_len * (1 + range_ratio))
# Add logging for debugging input_lens = np.random.randint(input_low,
logger.info("Sampling input_len from [%s, %s]", input_low, input_high) input_len + 1,
logger.info("Sampling output_len from [%s, %s]", output_low, output_high) size=num_requests)
output_lens = np.random.randint(output_low,
input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) output_len + 1,
output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) size=num_requests)
offsets = np.random.randint(0, vocab_size, size=num_requests) offsets = np.random.randint(0, vocab_size, size=num_requests)
requests = [] requests = []
for i in range(num_requests): for i in range(num_requests):
inner_seq = ( inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
(offsets[i] + i + np.arange(input_lens[i])) % vocab_size vocab_size).tolist()
).tolist()
token_sequence = prefix_token_ids + inner_seq token_sequence = prefix_token_ids + inner_seq
prompt = tokenizer.decode(token_sequence) prompt = tokenizer.decode(token_sequence)
# After decoding the prompt we have to encode and decode it again.
# This is done because in some cases N consecutive tokens
# give a string tokenized into != N number of tokens.
# For example for GPT2Tokenizer:
# [6880, 6881] -> ['Ġcalls', 'here'] ->
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decode again.
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
: input_lens[i]
]
prompt = tokenizer.decode(re_encoded_sequence)
total_input_len = prefix_len + int(input_lens[i]) total_input_len = prefix_len + int(input_lens[i])
requests.append( requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=total_input_len, prompt_len=total_input_len,
expected_output_len=int(output_lens[i]), expected_output_len=int(output_lens[i]),
) ))
)
return requests return requests
@ -388,8 +359,7 @@ class ShareGPTDataset(BenchmarkDataset):
self.data = json.load(f) self.data = json.load(f)
# Filter entries with at least two conversation turns. # Filter entries with at least two conversation turns.
self.data = [ self.data = [
entry entry for entry in self.data
for entry in self.data
if "conversations" in entry and len(entry["conversations"]) >= 2 if "conversations" in entry and len(entry["conversations"]) >= 2
] ]
random.seed(self.random_seed) random.seed(self.random_seed)
@ -415,28 +385,27 @@ class ShareGPTDataset(BenchmarkDataset):
) )
lora_request, tokenizer = self.get_random_lora_request( lora_request, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
)
prompt_ids = tokenizer(prompt).input_ids prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids) prompt_len = len(prompt_ids)
new_output_len = len(completion_ids) if output_len is None else output_len new_output_len = (len(completion_ids)
if not is_valid_sequence( if output_len is None else output_len)
prompt_len, if not is_valid_sequence(prompt_len,
new_output_len, new_output_len,
skip_min_output_len_check=output_len is not None, skip_min_output_len_check=output_len
): is not None):
continue continue
if enable_multimodal_chat: if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None) prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append( samples.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=new_output_len, expected_output_len=new_output_len,
lora_request=lora_request, lora_request=lora_request,
) ))
)
self.maybe_oversample_requests(samples, num_requests) self.maybe_oversample_requests(samples, num_requests)
return samples return samples
@ -482,45 +451,42 @@ class SonnetDataset(BenchmarkDataset):
) -> list: ) -> list:
# Calculate average token length for a poem line. # Calculate average token length for a poem line.
tokenized_lines = [tokenizer(line).input_ids for line in self.data] tokenized_lines = [tokenizer(line).input_ids for line in self.data]
avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) avg_len = sum(len(tokens)
for tokens in tokenized_lines) / len(tokenized_lines)
# Build the base prompt. # Build the base prompt.
base_prompt = "Pick as many lines as you can from these poem lines:\n" base_prompt = "Pick as many lines as you can from these poem lines:\n"
base_msg = [{"role": "user", "content": base_prompt}] base_msg = [{"role": "user", "content": base_prompt}]
base_fmt = tokenizer.apply_chat_template( base_fmt = tokenizer.apply_chat_template(base_msg,
base_msg, add_generation_prompt=True, tokenize=False add_generation_prompt=True,
) tokenize=False)
base_offset = len(tokenizer(base_fmt).input_ids) base_offset = len(tokenizer(base_fmt).input_ids)
if input_len <= base_offset: if input_len <= base_offset:
raise ValueError( raise ValueError(
f"'input_len' must be higher than the base prompt length " f"'input_len' must be higher than the base prompt length "
f"({base_offset})." f"({base_offset}).")
)
# Determine how many poem lines to use. # Determine how many poem lines to use.
num_input_lines = round((input_len - base_offset) / avg_len) num_input_lines = round((input_len - base_offset) / avg_len)
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) num_prefix_lines = round((prefix_len - base_offset) / avg_len)
prefix_lines = self.data[:num_prefix_lines] prefix_lines = self.data[:num_prefix_lines]
samples = [] samples = []
while len(samples) < num_requests: for _ in range(num_requests):
extra_lines = random.choices( extra_lines = random.choices(self.data,
self.data, k=num_input_lines - num_prefix_lines k=num_input_lines - num_prefix_lines)
)
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
msg = [{"role": "user", "content": prompt}] msg = [{"role": "user", "content": prompt}]
prompt_formatted = tokenizer.apply_chat_template( prompt_formatted = tokenizer.apply_chat_template(
msg, add_generation_prompt=True, tokenize=False msg, add_generation_prompt=True, tokenize=False)
)
prompt_len = len(tokenizer(prompt_formatted).input_ids) prompt_len = len(tokenizer(prompt_formatted).input_ids)
if prompt_len <= input_len: samples.append(
samples.append( SampleRequest(
SampleRequest( prompt=prompt_formatted
prompt=prompt_formatted if return_prompt_formatted else prompt, if return_prompt_formatted else prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
) ))
)
return samples return samples
@ -540,9 +506,7 @@ class BurstGPTDataset(BenchmarkDataset):
super().__init__(**kwargs) super().__init__(**kwargs)
self.load_data() self.load_data()
def load_data( def load_data(self, ):
self,
):
if self.dataset_path is None: if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.") raise ValueError("dataset_path must be provided for loading data.")
@ -556,7 +520,8 @@ class BurstGPTDataset(BenchmarkDataset):
def _sample_loaded_data(self, num_requests: int) -> list: def _sample_loaded_data(self, num_requests: int) -> list:
if num_requests <= len(self.data): if num_requests <= len(self.data):
data = self.data.sample(n=num_requests, random_state=self.random_seed) data = self.data.sample(n=num_requests,
random_state=self.random_seed)
else: else:
data = self.data.sample( data = self.data.sample(
n=num_requests, n=num_requests,
@ -580,8 +545,7 @@ class BurstGPTDataset(BenchmarkDataset):
input_len = int(data[i][2]) input_len = int(data[i][2])
output_len = int(data[i][3]) output_len = int(data[i][3])
lora_req, tokenizer = self.get_random_lora_request( lora_req, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
)
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
# Generate a synthetic prompt: a list of token IDs computed as (i + # Generate a synthetic prompt: a list of token IDs computed as (i +
# j) modulo vocab_size. # j) modulo vocab_size.
@ -593,71 +557,64 @@ class BurstGPTDataset(BenchmarkDataset):
prompt_len=input_len, prompt_len=input_len,
expected_output_len=output_len, expected_output_len=output_len,
lora_request=lora_req, lora_request=lora_req,
) ))
)
return samples return samples
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation # HuggingFace Dataset Implementation
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""Base class for datasets hosted on HuggingFace."""
SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
def __init__( def __init__(
self, self,
dataset_path: str,
dataset_split: str, dataset_split: str,
dataset_subset: Optional[str] = None, dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(dataset_path=dataset_path, **kwargs) super().__init__(**kwargs)
self.dataset_split = dataset_split self.dataset_split = dataset_split
self.dataset_subset = dataset_subset self.dataset_subset = dataset_subset
self.load_data() self.load_data()
def load_data(self) -> None: def load_data(self) -> None:
"""Load data from HuggingFace datasets.""" if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset( self.data = load_dataset(
self.dataset_path, self.dataset_path,
name=self.dataset_subset, name=self.dataset_subset,
split=self.dataset_split, split=self.dataset_split,
streaming=True, streaming=True,
) )
self.data = self.data.shuffle(seed=self.random_seed) if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
def sample(self,
# ----------------------------------------------------------------------------- tokenizer: PreTrainedTokenizerBase,
# Conversation Dataset Implementation num_requests: int,
# ----------------------------------------------------------------------------- output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
class ConversationDataset(HuggingFaceDataset):
"""Dataset for conversation data with multimodal support."""
SUPPORTED_DATASET_PATHS = {
"lmms-lab/LLaVA-OneVision-Data",
"Aeala/ShareGPT_Vicuna_unfiltered",
}
IS_MULTIMODAL = True
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
sampled_requests = [] sampled_requests = []
dynamic_output = output_len is None dynamic_output = output_len is None
for item in filtered_data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
conv = item["conversations"] conv = item["conversations"]
@ -669,22 +626,24 @@ class ConversationDataset(HuggingFaceDataset):
completion_len = len(completion_ids) completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0 assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(prompt_len, completion_len): if dynamic_output and not is_valid_sequence(
prompt_len, completion_len):
continue continue
mm_content = process_image(item["image"]) if "image" in item else None mm_content = process_image(
item["image"]) if "image" in item else None
if enable_multimodal_chat: if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer # Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the # accurate and we will be using request output to count the
# actual prompt len and output len # actual prompt len and output len
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
multi_modal_data=mm_content, multi_modal_data=mm_content,
) ))
)
self.maybe_oversample_requests(sampled_requests, num_requests) self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests return sampled_requests
@ -700,11 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
""" """
DEFAULT_OUTPUT_LEN = 128 DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = { VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
"lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
"lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], def __init__(
} self,
IS_MULTIMODAL = True **kwargs,
) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample( def sample(
self, self,
@ -714,356 +691,27 @@ class VisionArenaDataset(HuggingFaceDataset):
enable_multimodal_chat: bool = False, enable_multimodal_chat: bool = False,
**kwargs, **kwargs,
) -> list: ) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = [] sampled_requests = []
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) prompt = item["turns"][0][0]["content"]
if parser_fn is None:
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
mm_content = process_image(item["images"][0]) mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat: if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer # Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the # accurate and we will be using request output to count the
# actual prompt len # actual prompt len
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
multi_modal_data=mm_content, multi_modal_data=mm_content,
) ))
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is the dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets, and covers multiple distinct
code editing scenario.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# MT-Bench Dataset Implementation
# -----------------------------------------------------------------------------
class MTBenchDataset(HuggingFaceDataset):
"""
MT-Bench Dataset.
https://huggingface.co/datasets/philschmid/mt-bench
We create a single turn dataset for MT-Bench.
This is similar to Spec decoding benchmark setup in vLLM
https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
""" # noqa: E501
DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
SUPPORTED_DATASET_PATHS = {
"philschmid/mt-bench",
}
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["turns"][0]
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
"""
Dataset class for processing a AIMO dataset with reasoning questions.
"""
SUPPORTED_DATASET_PATHS = {
"AI-MO/aimo-validation-aime",
"AI-MO/NuminaMath-1.5",
"AI-MO/NuminaMath-CoT",
}
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs,
) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt, completion = item["problem"], item["solution"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(
prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
):
continue
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Next Edit Prediction Dataset Implementation
# -----------------------------------------------------------------------------
zeta_prompt = """### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
### User Edits:
{}
### User Excerpt:
{}
### Response:
""" # noqa: E501
def _format_zeta_prompt(
sample: dict, original_start_marker: str = "<|editable_region_start|>"
) -> dict:
"""Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
This function formats examples from the NEP dataset
into prompts and expected outputs. It could be
further extended to support more NEP datasets.
Args:
sample: The dataset sample containing events,
inputs, and outputs.
original_start_marker: The marker indicating the
start of the editable region. Defaults to
"<|editable_region_start|>".
Returns:
A dictionary with the formatted prompts and expected outputs.
"""
events = sample["events"]
input = sample["input"]
output = sample["output"]
prompt = zeta_prompt.format(events, input)
# following the original implementation, extract the focused region
# from the raw output
output_start_index = output.find(original_start_marker)
output_focused_region = output[output_start_index:]
expected_output = output_focused_region
return {"prompt": prompt, "expected_output": expected_output}
class NextEditPredictionDataset(HuggingFaceDataset):
"""
Dataset class for processing a Next Edit Prediction dataset.
"""
SUPPORTED_DATASET_PATHS = {
"zed-industries/zeta",
}
MAPPING_PROMPT_FUNCS = {
"zed-industries/zeta": _format_zeta_prompt,
}
def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
if formatting_prompt_func is None:
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
samples = []
for sample in self.data:
sample = formatting_prompt_func(sample)
samples.append(
SampleRequest(
prompt=sample["prompt"],
prompt_len=len(tokenizer(sample["prompt"]).input_ids),
expected_output_len=len(
tokenizer(sample["expected_output"]).input_ids
),
)
)
if len(samples) >= num_requests:
break
self.maybe_oversample_requests(samples, num_requests)
return samples
# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------
class ASRDataset(HuggingFaceDataset):
"""
Dataset class for processing a ASR dataset for transcription.
Tested on the following set:
+----------------+----------------------------------------+--------------------------+-----------------------------+
| Dataset | Domain | Speaking Style | hf-subset |
+----------------+----------------------------------------+--------------------------+-----------------------------+
| TED-LIUM | TED talks | Oratory | release1, release2, release3|
| | | | release3-speaker-adaptation |
| VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
| LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" |
| GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
| SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
| AMI | Meetings | Spontaneous | ihm, sdm |
+----------------+----------------------------------------+--------------------------+-----------------------------+
""" # noqa: E501
SUPPORTED_DATASET_PATHS = {
"openslr/librispeech_asr",
"facebook/voxpopuli",
"LIUM/tedlium",
"edinburghcstr/ami",
"speechcolab/gigaspeech",
"kensho/spgispeech",
}
DEFAULT_OUTPUT_LEN = 128
IS_MULTIMODAL = True
# TODO Whisper-specific. Abstract interface when more models are supported.
TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
skip_long_audios: bool = True
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs,
) -> list:
import librosa
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests = []
skipped = 0
for item in self.data:
if len(sampled_requests) >= num_requests:
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
# Whisper max supported duration
if self.skip_long_audios and duration_s > 30:
skipped += 1
continue
mm_content = {"audio": (y, sr)}
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
)
)
if skipped:
logger.warning(
"%d samples discarded from dataset due to"
" their length being greater than"
" what Whisper supports.",
skipped,
)
self.maybe_oversample_requests(sampled_requests, num_requests) self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests return sampled_requests

View File

@ -11,9 +11,9 @@ from typing import Any, Optional
import numpy as np import numpy as np
import torch import torch
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm from tqdm import tqdm
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType from vllm.inputs import PromptType
@ -21,14 +21,13 @@ from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format( def save_to_pytorch_benchmark_format(args: argparse.Namespace,
args: argparse.Namespace, results: dict[str, Any] results: dict[str, Any]) -> None:
) -> None:
pt_records = convert_to_pytorch_benchmark_format( pt_records = convert_to_pytorch_benchmark_format(
args=args, args=args,
metrics={"latency": results["latencies"]}, metrics={"latency": results["latencies"]},
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, extra_info={k: results[k]
) for k in ["avg_latency", "percentiles"]})
if pt_records: if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@ -43,11 +42,9 @@ def main(args: argparse.Namespace):
# the engine will automatically process the request in multiple batches. # the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= ( assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len args.input_len +
), ( args.output_len), ("Please ensure that max_model_len is greater than"
"Please ensure that max_model_len is greater than" " the sum of input_len and output_len.")
" the sum of input_len and output_len."
)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=args.n, n=args.n,
@ -58,16 +55,18 @@ def main(args: argparse.Namespace):
detokenize=not args.disable_detokenize, detokenize=not args.disable_detokenize,
) )
print(sampling_params) print(sampling_params)
dummy_prompt_token_ids = np.random.randint( dummy_prompt_token_ids = np.random.randint(10000,
10000, size=(args.batch_size, args.input_len) size=(args.batch_size,
) args.input_len))
dummy_prompts: list[PromptType] = [ dummy_prompts: list[PromptType] = [{
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() "prompt_token_ids": batch
] } for batch in dummy_prompt_token_ids.tolist()]
def llm_generate(): def llm_generate():
if not args.use_beam_search: if not args.use_beam_search:
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
else: else:
llm.beam_search( llm.beam_search(
dummy_prompts, dummy_prompts,
@ -81,13 +80,12 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
with torch.profiler.profile( with torch.profiler.profile(
activities=[ activities=[
torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA, torch.profiler.ProfilerActivity.CUDA,
], ],
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir) str(profile_dir)),
),
) as p: ) as p:
llm_generate() llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total")) print(p.key_averages().table(sort_by="self_cuda_time_total"))
@ -105,9 +103,8 @@ def main(args: argparse.Namespace):
if args.profile: if args.profile:
profile_dir = args.profile_result_dir profile_dir = args.profile_result_dir
if not profile_dir: if not profile_dir:
profile_dir = ( profile_dir = (Path(".") / "vllm_benchmark_result" /
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" f"latency_result_{time.time()}")
)
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
@ -138,8 +135,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of " description="Benchmark the latency of processing a single batch of "
"requests till completion." "requests till completion.")
)
parser.add_argument("--input-len", type=int, default=32) parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128) parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8) parser.add_argument("--batch-size", type=int, default=8)
@ -156,9 +152,10 @@ if __name__ == "__main__":
default=10, default=10,
help="Number of iterations to run for warmup.", help="Number of iterations to run for warmup.",
) )
parser.add_argument( parser.add_argument("--num-iters",
"--num-iters", type=int, default=30, help="Number of iterations to run." type=int,
) default=30,
help="Number of iterations to run.")
parser.add_argument( parser.add_argument(
"--profile", "--profile",
action="store_true", action="store_true",
@ -168,10 +165,8 @@ if __name__ == "__main__":
"--profile-result-dir", "--profile-result-dir",
type=str, type=str,
default=None, default=None,
help=( help=("path to save the pytorch profiler output. Can be visualized "
"path to save the pytorch profiler output. Can be visualized " "with ui.perfetto.dev or Tensorboard."),
"with ui.perfetto.dev or Tensorboard."
),
) )
parser.add_argument( parser.add_argument(
"--output-json", "--output-json",
@ -182,10 +177,8 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--disable-detokenize", "--disable-detokenize",
action="store_true", action="store_true",
help=( help=("Do not detokenize responses (i.e. do not include "
"Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"),
"detokenization time in the latency measurement)"
),
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)

View File

@ -76,7 +76,7 @@ def repeat_prompts(prompts, repeat_count, mode: str):
- 'random': Shuffle the prompts randomly after repetition. - 'random': Shuffle the prompts randomly after repetition.
- 'tile': Repeat the entire prompt list in sequence. - 'tile': Repeat the entire prompt list in sequence.
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
- 'interleave': Repeat each prompt consecutively before moving to - 'interleave': Repeat each prompt consecutively before moving to
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
Returns: Returns:
@ -86,21 +86,20 @@ def repeat_prompts(prompts, repeat_count, mode: str):
ValueError: If an invalid mode is provided. ValueError: If an invalid mode is provided.
""" """
print("Repeat mode: ", mode) print("Repeat mode: ", mode)
if mode == "random": if mode == 'random':
repeated_prompts = prompts * repeat_count repeated_prompts = prompts * repeat_count
random.shuffle(repeated_prompts) random.shuffle(repeated_prompts)
return repeated_prompts return repeated_prompts
elif mode == "tile": elif mode == 'tile':
return prompts * repeat_count return prompts * repeat_count
elif mode == "interleave": elif mode == 'interleave':
repeated_prompts = [] repeated_prompts = []
for prompt in prompts: for prompt in prompts:
repeated_prompts.extend([prompt] * repeat_count) repeated_prompts.extend([prompt] * repeat_count)
return repeated_prompts return repeated_prompts
else: else:
raise ValueError( raise ValueError(f"Invalid mode: {mode}, only support "
f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'" "'random', 'tile', 'interleave'")
)
def main(args): def main(args):
@ -110,16 +109,16 @@ def main(args):
# we append the document id at the beginning to avoid any of the document # we append the document id at the beginning to avoid any of the document
# being the prefix of other documents # being the prefix of other documents
prompts = [ prompts = [
str(i) + " ".join(["hi"] * args.document_length) str(i) + ' '.join(['hi'] * args.document_length)
for i in range(args.num_documents) for i in range(args.num_documents)
] ]
prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
warmup_prompts = [ warmup_prompts = [
"This is warm up request " + str(i) + " ".join(["hi"] * args.document_length) "This is warm up request " + str(i) + \
for i in range(args.num_documents) ' '.join(['hi'] * args.document_length)
] for i in range(args.num_documents)]
# Create the LLM engine # Create the LLM engine
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
@ -143,52 +142,42 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or " description=
"without automatic prefix caching." 'Benchmark the performance with or without automatic prefix caching.')
)
parser.add_argument( parser.add_argument(
"--document-length", '--document-length',
type=int, type=int,
# Roughly the number of tokens for a system paper, # Roughly the number of tokens for a system paper,
# excluding images # excluding images
default=20000, default=20000,
help="Range of input lengths for sampling prompts, " help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").', 'specified as "min:max" (e.g., "128:256").')
)
parser.add_argument( parser.add_argument('--num-documents',
"--num-documents", type=int,
type=int, default=8,
default=8, help='Range of input lengths for sampling prompts,'
help="Range of input lengths for sampling prompts, " 'specified as "min:max" (e.g., "128:256").')
'specified as "min:max" (e.g., "128:256").',
)
parser.add_argument("--output-len", type=int, default=10) parser.add_argument('--output-len', type=int, default=10)
parser.add_argument( parser.add_argument('--repeat-count',
"--repeat-count", type=int,
type=int, default=2,
default=2, help='Number of times to repeat each prompt')
help="Number of times to repeat each prompt",
)
parser.add_argument( parser.add_argument("--repeat-mode",
"--repeat-mode", type=str,
type=str, default='random',
default="random", help='The mode to repeat prompts. The supported '
help="The mode to repeat prompts. The supported " 'modes are "random", "tile", and "interleave". '
'modes are "random", "tile", and "interleave". ' 'See repeat_prompts() in the source code for details.')
"See repeat_prompts() in the source code for details.",
)
parser.add_argument( parser.add_argument("--shuffle-seed",
"--shuffle-seed", type=int,
type=int, default=0,
default=0, help='Random seed when the repeat mode is "random"')
help='Random seed when the repeat mode is "random"',
)
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()

View File

@ -63,15 +63,14 @@ class Request:
output_len: int output_len: int
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
vocab = tokenizer.get_vocab() vocab = tokenizer.get_vocab()
all_special_ids = set(tokenizer.all_special_ids)
# Remove the special tokens. # Remove the special tokens.
return random.choices( vocab = {
[v for k, v in vocab.items() if k not in all_special_ids], k: v
k=length, for k, v in vocab.items() if k not in tokenizer.all_special_ids
) }
return random.choices(list(vocab.values()), k=length)
def sample_requests_from_dataset( def sample_requests_from_dataset(
@ -90,10 +89,8 @@ def sample_requests_from_dataset(
# Filter out the conversations with less than 2 turns. # Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2] dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation. # Only keep the first two turns of each conversation.
dataset = [ dataset = [(data["conversations"][0]["value"],
(data["conversations"][0]["value"], data["conversations"][1]["value"]) data["conversations"][1]["value"]) for data in dataset]
for data in dataset
]
# Shuffle the dataset. # Shuffle the dataset.
random.shuffle(dataset) random.shuffle(dataset)
@ -114,9 +111,8 @@ def sample_requests_from_dataset(
completion = dataset[i][1] completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids) prompt_len = len(prompt_token_ids)
output_len = ( output_len = (len(completion_token_ids)
len(completion_token_ids) if fixed_output_len is None else fixed_output_len if fixed_output_len is None else fixed_output_len)
)
if min_len <= prompt_len <= max_len: if min_len <= prompt_len <= max_len:
filtered_requests.append(Request(prompt, prompt_len, output_len)) filtered_requests.append(Request(prompt, prompt_len, output_len))
@ -130,27 +126,27 @@ def sample_requests_from_random(
fixed_output_len: Optional[int], fixed_output_len: Optional[int],
prefix_len: int, prefix_len: int,
) -> list[Request]: ) -> list[Request]:
requests = [] requests = []
prefix_token_ids = sample_tokens(tokenizer, prefix_len) prefix_token_ids = sample_tokens(tokenizer, prefix_len)
min_len, max_len = input_length_range min_len, max_len = input_length_range
for i in range(num_requests): for i in range(num_requests):
unique_part_token_ids = sample_tokens( unique_part_token_ids = sample_tokens(
tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len) tokenizer,
) random.randint(min_len - prefix_len, max_len - prefix_len))
prompt_token_ids = prefix_token_ids + unique_part_token_ids prompt_token_ids = prefix_token_ids + unique_part_token_ids
prompt = tokenizer.decode(prompt_token_ids) prompt = tokenizer.decode(prompt_token_ids)
prompt_len = len(prompt_token_ids) prompt_len = len(prompt_token_ids)
assert min_len <= prompt_len <= max_len, ( assert (min_len <= prompt_len <= max_len
f"prompt_len {prompt_len} out of range {min_len}:{max_len}" ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
)
requests.append(Request(prompt, prompt_len, fixed_output_len)) requests.append(Request(prompt, prompt_len, fixed_output_len))
return requests return requests
def repeat_and_sort_requests( def repeat_and_sort_requests(requests: list[Request],
requests: list[Request], repeat_count: int, sort: bool = False repeat_count: int,
) -> list[str]: sort: bool = False) -> list[str]:
repeated_requests = requests * repeat_count repeated_requests = requests * repeat_count
if sort: if sort:
repeated_requests.sort(key=lambda x: x[1]) repeated_requests.sort(key=lambda x: x[1])
@ -161,14 +157,14 @@ def repeat_and_sort_requests(
def main(args): def main(args):
tokenizer = get_tokenizer(args.model, trust_remote_code=True) tokenizer = get_tokenizer(args.model, trust_remote_code=True)
input_length_range = tuple(map(int, args.input_length_range.split(":"))) input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed) random.seed(args.seed)
if args.dataset_path is not None: if args.dataset_path is not None:
if args.prefix_len > 0: if args.prefix_len > 0:
raise ValueError( raise ValueError("prefix-len is not supported when "
"prefix-len is not supported when dataset-path is provided." "dataset-path is provided.")
) print(f"Start to sample {args.num_prompts} prompts "
print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}") f"from {args.dataset_path}")
filtered_requests = sample_requests_from_dataset( filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
num_requests=args.num_prompts, num_requests=args.num_prompts,
@ -198,16 +194,14 @@ def main(args):
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams( sampling_params = SamplingParams(temperature=0,
temperature=0, max_tokens=args.output_len,
max_tokens=args.output_len, detokenize=not args.disable_detokenize)
detokenize=not args.disable_detokenize,
)
print("Testing filtered requests") print("Testing filtered requests")
prompts = repeat_and_sort_requests( prompts = repeat_and_sort_requests(filtered_requests,
filtered_requests, repeat_count=args.repeat_count, sort=args.sort repeat_count=args.repeat_count,
) sort=args.sort)
print("------start generating------") print("------start generating------")
test_prefix( test_prefix(
@ -219,35 +213,29 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or without " description=
"automatic prefix caching." 'Benchmark the performance with or without automatic prefix caching.')
) parser.add_argument("--dataset-path",
parser.add_argument( type=str,
"--dataset-path", type=str, default=None, help="Path to the dataset." default=None,
) help="Path to the dataset.")
parser.add_argument("--output-len", type=int, default=10) parser.add_argument('--output-len', type=int, default=10)
parser.add_argument( parser.add_argument('--num-prompts',
"--num-prompts", type=int,
type=int, required=True,
required=True, help="Number of the prompts sampled from dataset")
help="Number of the prompts sampled from dataset", parser.add_argument('--repeat-count',
) type=int,
parser.add_argument( default=1,
"--repeat-count", help='Number of times to repeat each prompt')
type=int, parser.add_argument('--sort',
default=1, action='store_true',
help="Number of times to repeat each prompt", help='Sort prompts by input length')
) parser.add_argument('--input-length-range',
parser.add_argument( type=str,
"--sort", action="store_true", help="Sort prompts by input length" required=True,
) help='Range of input lengths for sampling prompts,'
parser.add_argument( 'specified as "min:max" (e.g., "128:256").')
"--input-length-range",
type=str,
required=True,
help="Range of input lengths for sampling prompts,"
'specified as "min:max" (e.g., "128:256").',
)
parser.add_argument( parser.add_argument(
"--prefix-len", "--prefix-len",
type=int, type=int,
@ -258,12 +246,10 @@ if __name__ == "__main__":
"when dataset-path is not provided.", "when dataset-path is not provided.",
) )
parser.add_argument( parser.add_argument(
"--disable-detokenize", '--disable-detokenize',
action="store_true", action='store_true',
help=( help=("Do not detokenize responses (i.e. do not include "
"Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"),
"detokenization time in the latency measurement)"
),
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization.""" """Benchmark offline prioritization."""
import argparse import argparse
import dataclasses import dataclasses
import json import json
@ -14,7 +13,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
# Select a equi-probable random priority #Select a equi-probable random priority
def get_random_flag(): def get_random_flag():
return 0 if random.random() < 0.5 else 1 return 0 if random.random() < 0.5 else 1
@ -34,10 +33,8 @@ def sample_requests(
# Filter out the conversations with less than 2 turns. # Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2] dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation. # Only keep the first two turns of each conversation.
dataset = [ dataset = [(data["conversations"][0]["value"],
(data["conversations"][0]["value"], data["conversations"][1]["value"]) data["conversations"][1]["value"]) for data in dataset]
for data in dataset
]
# Shuffle the dataset. # Shuffle the dataset.
random.shuffle(dataset) random.shuffle(dataset)
@ -54,9 +51,8 @@ def sample_requests(
completion = dataset[i][1] completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids) prompt_len = len(prompt_token_ids)
output_len = ( output_len = len(completion_token_ids
len(completion_token_ids) if fixed_output_len is None else fixed_output_len ) if fixed_output_len is None else fixed_output_len
)
if prompt_len < 4 or output_len < 4: if prompt_len < 4 or output_len < 4:
# Prune too short sequences. # Prune too short sequences.
continue continue
@ -78,16 +74,13 @@ def run_vllm(
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert all( assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
for request in requests for request in requests), (
), ( "Please ensure that max_model_len is greater than the sum of"
"Please ensure that max_model_len is greater than the sum of" " input_len and output_len for all requests.")
" input_len and output_len for all requests."
)
# Add the requests to the engine. # Add the requests to the engine.
prompts = [] prompts = []
@ -104,8 +97,7 @@ def run_vllm(
ignore_eos=True, ignore_eos=True,
max_tokens=output_len, max_tokens=output_len,
detokenize=not disable_detokenize, detokenize=not disable_detokenize,
) ))
)
start = time.perf_counter() start = time.perf_counter()
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
@ -119,33 +111,26 @@ def main(args: argparse.Namespace):
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code args.tokenizer, trust_remote_code=args.trust_remote_code)
)
if args.dataset is None: if args.dataset is None:
# Synthesize a prompt with the given input length. # Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1) prompt = "hi" * (args.input_len - 1)
requests = [ requests = [(prompt, args.input_len, args.output_len,
(prompt, args.input_len, args.output_len, get_random_flag()) get_random_flag()) for _ in range(args.num_prompts)]
for _ in range(args.num_prompts)
]
else: else:
requests = sample_requests( requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.dataset, args.num_prompts, tokenizer, args.output_len args.output_len)
)
if args.backend == "vllm": if args.backend == "vllm":
elapsed_time = run_vllm( elapsed_time = run_vllm(requests, args.n,
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize EngineArgs.from_cli_args(args),
) args.disable_detokenize)
else: else:
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum( total_num_tokens = sum(prompt_len + output_len
prompt_len + output_len for _, prompt_len, output_len, priority in requests for _, prompt_len, output_len, priority in requests)
) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
print( f"{total_num_tokens / elapsed_time:.2f} tokens/s")
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s"
)
# Output JSON results if specified # Output JSON results if specified
if args.output_json: if args.output_json:
@ -162,44 +147,41 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=200,
help="Number of prompts to process.")
parser.add_argument( parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" '--output-json',
)
parser.add_argument(
"--dataset", type=str, default=None, help="Path to the dataset."
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=200, help="Number of prompts to process."
)
parser.add_argument(
"--output-json",
type=str, type=str,
default=None, default=None,
help="Path to save the throughput results in JSON format.", help='Path to save the throughput results in JSON format.')
)
parser.add_argument( parser.add_argument(
"--disable-detokenize", '--disable-detokenize',
action="store_true", action='store_true',
help=( help=("Do not detokenize responses (i.e. do not include "
"Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"),
"detokenization time in the latency measurement)"
),
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)

File diff suppressed because it is too large Load Diff

View File

@ -5,13 +5,16 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server) (vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \ python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \ --backend <backend> \
--model <your_model> \ --model <your_model> \
--dataset json \ --dataset json \
--structured-output-ratio 1.0 \ --structured-output-ratio 1.0 \
--structured-output-backend auto \ --structured-output-backend xgrammar \
--request-rate 10 \ --request-rate 10 \
--num-prompts 1000 --num-prompts 1000
@ -19,7 +22,6 @@ On the client side, run:
--endpoint /generate_stream --endpoint /generate_stream
to the end of the command above. to the end of the command above.
""" """
import argparse import argparse
import asyncio import asyncio
import copy import copy
@ -37,15 +39,11 @@ from typing import Optional
import datasets import datasets
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError: except ImportError:
@ -56,9 +54,8 @@ try:
except ImportError: except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.v1.structured_output.backend_xgrammar import ( from vllm.v1.structured_output.utils import (
has_xgrammar_unsupported_json_features, has_xgrammar_unsupported_json_features)
)
MILLISECONDS_TO_SECONDS_CONVERSION = 1000 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -104,7 +101,6 @@ class SampleRequest:
prompt_len: The length of the prompt in tokens. prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens. expected_output_len: The expected length of the output in tokens.
""" """
prompt: str prompt: str
prompt_len: int prompt_len: int
expected_output_len: int expected_output_len: int
@ -113,61 +109,60 @@ class SampleRequest:
completion: str = None completion: str = None
def sample_requests( def sample_requests(tokenizer: PreTrainedTokenizerBase,
tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace args: argparse.Namespace) -> list[SampleRequest]:
) -> list[SampleRequest]: if args.dataset == 'json' or args.dataset == 'json-unique':
if args.dataset == "json" or args.dataset == "json-unique":
if args.json_schema_path is None: if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__)) dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join( args.json_schema_path = os.path.join(dir_path,
dir_path, "structured_schemas", "structured_schema_1.json" "structured_schemas",
) "structured_schema_1.json")
json_schemas = [] json_schemas = []
with open(args.json_schema_path) as f: with open(args.json_schema_path) as f:
schema = json.load(f) schema = json.load(f)
if args.dataset == "json-unique": if args.dataset == 'json-unique':
json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)] json_schemas = [
copy.deepcopy(schema) for _ in range(args.num_prompts)
]
for i in range(len(json_schemas)): for i in range(len(json_schemas)):
if "properties" not in json_schemas[i]: json_schemas[i]["properties"][
json_schemas[i]["properties"] = {} f"__optional_field_{uuid.uuid4()}"] = {
json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = { "type":
"type": "string", "string",
"description": "An unique optional field to avoid cached schemas", "description":
} "An unique optional field to avoid cached schemas"
else: }
json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int): def gen_prompt(index: int):
return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 schema = json_schemas[index % len(json_schemas)]
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
def get_schema(index: int): def get_schema(index: int):
return json_schemas[index % len(json_schemas)] return json_schemas[index % len(json_schemas)]
requests = [ requests = [
SampleRequest( SampleRequest(prompt=gen_prompt(i),
prompt=gen_prompt(i), prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
prompt_len=len(tokenizer(gen_prompt(i)).input_ids), expected_output_len=args.output_len,
expected_output_len=args.output_len, schema=get_schema(i),
schema=get_schema(i), structure_type=args.structure_type)
structure_type=args.structure_type,
)
for i in range(args.num_prompts) for i in range(args.num_prompts)
] ]
elif args.dataset == "grammar": elif args.dataset == "grammar":
schema = """ schema = """
root ::= select_statement ?start: select_statement
select_statement ::= "SELECT " column " from " table " where " condition ?select_statement: "SELECT " column_list " FROM " table_name
column ::= "col_1 " | "col_2 " ?column_list: column_name ("," column_name)*
table ::= "table_1 " | "table_2 " ?table_name: identifier
condition ::= column "= " number ?column_name: identifier
number ::= "1 " | "2 " ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
""" """
prompt = "Generate an SQL query to show the 'username' \ prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table." and 'email' from the 'users' table."
@ -175,13 +170,11 @@ def sample_requests(
input_len = len(tokenizer(prompt).input_ids) input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens") print(f"Input length of the prompt: {input_len} tokens")
requests = [ requests = [
SampleRequest( SampleRequest(prompt=prompt,
prompt=prompt, prompt_len=input_len,
prompt_len=input_len, expected_output_len=args.output_len,
expected_output_len=args.output_len, schema=schema,
schema=schema, structure_type=args.structure_type)
structure_type=args.structure_type,
)
for _ in range(args.num_prompts) for _ in range(args.num_prompts)
] ]
@ -195,13 +188,11 @@ def sample_requests(
input_len = len(tokenizer(prompt).input_ids) input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens") print(f"Input length of the prompt: {input_len} tokens")
requests = [ requests = [
SampleRequest( SampleRequest(prompt=prompt,
prompt=prompt, prompt_len=input_len,
prompt_len=input_len, expected_output_len=args.output_len,
expected_output_len=args.output_len, schema=regex,
schema=regex, structure_type=args.structure_type)
structure_type=args.structure_type,
)
for _ in range(args.num_prompts) for _ in range(args.num_prompts)
] ]
@ -212,55 +203,47 @@ def sample_requests(
input_len = len(tokenizer(prompt).input_ids) input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens") print(f"Input length of the prompt: {input_len} tokens")
requests = [ requests = [
SampleRequest( SampleRequest(prompt=prompt,
prompt=prompt, prompt_len=input_len,
prompt_len=input_len, expected_output_len=args.output_len,
expected_output_len=args.output_len, schema=choice,
schema=choice, structure_type=args.structure_type)
structure_type=args.structure_type,
)
for _ in range(args.num_prompts) for _ in range(args.num_prompts)
] ]
elif args.dataset == "xgrammar_bench": elif args.dataset == "xgrammar_bench":
requests: list[SampleRequest] = [] requests: list[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
full_dataset_len = len(dataset) full_dataset_len = len(dataset)
def _filter_func(item): def _filter_func(item):
import json import json
schema = json.loads(item["schema"]) schema = json.loads(item["schema"])
return not has_xgrammar_unsupported_json_features(schema) return not has_xgrammar_unsupported_json_features(schema)
dataset = dataset.filter(_filter_func) dataset = dataset.filter(_filter_func)
num_filtered_out = full_dataset_len - len(dataset) num_filtered_out = full_dataset_len - len(dataset)
print( print(f"dataset has {len(dataset)} entries after filtering "
f"dataset has {len(dataset)} entries after filtering " f"out {num_filtered_out} entries with unsupported features")
f"out {num_filtered_out} entries with unsupported features"
)
len_dataset = len(dataset) len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts): for data_point_idx in range(args.num_prompts):
idx = data_point_idx idx = data_point_idx
while idx >= len_dataset: while idx >= len_dataset:
idx -= len_dataset idx -= len_dataset
schema = dataset["schema"][idx] schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template( prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
dataset["prompt"][idx], tokenize=False, add_generation_prompt=True tokenize=False)
)
input_len = len(tokenizer(prompt).input_ids) input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx] completion = dataset["completion"][idx]
requests.append( requests.append(
SampleRequest( SampleRequest(prompt=prompt,
prompt=prompt, prompt_len=input_len,
prompt_len=input_len, expected_output_len=args.output_len,
expected_output_len=args.output_len, schema=schema,
schema=schema, structure_type=args.structure_type,
structure_type=args.structure_type, completion=completion))
completion=completion,
)
)
return requests return requests
@ -292,8 +275,7 @@ async def get_request(
# Calculate scale parameter theta to maintain the desired request_rate. # Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, ( assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}." f"A positive burstiness factor is expected, but given {burstiness}.")
)
theta = 1.0 / (request_rate * burstiness) theta = 1.0 / (request_rate * burstiness)
for i, request in enumerate(input_requests): for i, request in enumerate(input_requests):
@ -335,8 +317,8 @@ def calculate_metrics(
# multiple output tokens may be bundled together # multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly # Note : this may inflate the output token count slightly
output_len = len( output_len = len(
tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids tokenizer(outputs[i].generated_text,
) add_special_tokens=False).input_ids)
actual_output_lens.append(output_len) actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len total_input += input_requests[i].prompt_len
tpot = 0 tpot = 0
@ -360,19 +342,16 @@ def calculate_metrics(
if "ttft" in goodput_config_dict: if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts) valid_metrics.append(ttfts)
slo_values.append( slo_values.append(goodput_config_dict["ttft"] /
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION MILLISECONDS_TO_SECONDS_CONVERSION)
)
if "tpot" in goodput_config_dict: if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots) valid_metrics.append(all_tpots)
slo_values.append( slo_values.append(goodput_config_dict["tpot"] /
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION MILLISECONDS_TO_SECONDS_CONVERSION)
)
if "e2el" in goodput_config_dict: if "e2el" in goodput_config_dict:
valid_metrics.append(e2els) valid_metrics.append(e2els)
slo_values.append( slo_values.append(goodput_config_dict["e2el"] /
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION MILLISECONDS_TO_SECONDS_CONVERSION)
)
for req_metric in zip(*valid_metrics): for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@ -383,8 +362,7 @@ def calculate_metrics(
warnings.warn( warnings.warn(
"All requests failed. This is likely due to a misconfiguration " "All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.", "on the benchmark arguments.",
stacklevel=2, stacklevel=2)
)
metrics = BenchmarkMetrics( metrics = BenchmarkMetrics(
completed=completed, completed=completed,
total_input=total_input, total_input=total_input,
@ -393,31 +371,27 @@ def calculate_metrics(
request_goodput=good_completed / dur_s, request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s, output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) mean_ttft_ms=np.mean(ttfts or 0) *
* 1000, # ttfts is empty if streaming is not supported by backend 1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000, std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles for p in selected_percentiles],
],
mean_tpot_ms=np.mean(tpots or 0) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles for p in selected_percentiles],
],
mean_itl_ms=np.mean(itls or 0) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles for p in selected_percentiles],
],
mean_e2el_ms=np.mean(e2els or 0) * 1000, mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles for p in selected_percentiles],
],
) )
return metrics, actual_output_lens return metrics, actual_output_lens
@ -439,6 +413,7 @@ async def benchmark(
ignore_eos: bool, ignore_eos: bool,
max_concurrency: Optional[int], max_concurrency: Optional[int],
structured_output_ratio: float, structured_output_ratio: float,
structured_output_backend: str,
goodput_config_dict: Optional[dict[str, float]] = None, goodput_config_dict: Optional[dict[str, float]] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
@ -450,17 +425,18 @@ async def benchmark(
extra_body = {} extra_body = {}
# Add the schema to the extra_body # Add the schema to the extra_body
extra_body[request.structure_type] = request.schema extra_body[request.structure_type] = request.schema
# Add the specific structured_output_backend
extra_body["guided_decoding_backend"] = structured_output_backend
return extra_body return extra_body
print("Starting initial single prompt test run...") print("Starting initial single prompt test run...")
structured_output_req_idx = random.sample( structured_output_req_idx = random.sample(
range(len(input_requests)), int(len(input_requests) * structured_output_ratio) range(len(input_requests)),
) int(len(input_requests) * structured_output_ratio))
test_request = input_requests[0] test_request = input_requests[0]
test_req_extra_body = ( test_req_extra_body = (prepare_extra_body(test_request)
prepare_extra_body(test_request) if 0 in structured_output_req_idx else None if 0 in structured_output_req_idx else None)
)
test_input = RequestFuncInput( test_input = RequestFuncInput(
model=model_id, model=model_id,
prompt=test_request.prompt, prompt=test_request.prompt,
@ -474,8 +450,7 @@ async def benchmark(
if not test_output.success: if not test_output.success:
raise ValueError( raise ValueError(
"Initial test run failed - Please make sure benchmark arguments " "Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}" f"are correctly specified. Error: {test_output.error}")
)
else: else:
print("Initial test run completed. Starting main benchmark run...") print("Initial test run completed. Starting main benchmark run...")
@ -494,7 +469,10 @@ async def benchmark(
if profile_output.success: if profile_output.success:
print("Profiler started") print("Profiler started")
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" if burstiness == 1.0:
distribution = "Poisson process"
else:
distribution = "Gamma distribution"
print(f"Traffic request rate: {request_rate}") print(f"Traffic request rate: {request_rate}")
print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Burstiness factor: {burstiness} ({distribution})")
@ -506,21 +484,24 @@ async def benchmark(
# and it will simplify the code in limited_request_func. # and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency) # semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext()) # if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
async def limited_request_func(request_func_input, pbar): async def limited_request_func(request_func_input, pbar):
if semaphore is None: if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar) return await request_func(request_func_input=request_func_input,
pbar=pbar)
async with semaphore: async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar) return await request_func(request_func_input=request_func_input,
pbar=pbar)
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = [] tasks: list[asyncio.Task] = []
expected: list[str] = [] expected: list[str] = []
async for i, request in get_request(input_requests, request_rate, burstiness): async for i, request in get_request(input_requests, request_rate,
extra_body = ( burstiness):
prepare_extra_body(request) if i in structured_output_req_idx else None extra_body = prepare_extra_body(
) request) if i in structured_output_req_idx else None
request_func_input = RequestFuncInput( request_func_input = RequestFuncInput(
model=model_id, model=model_id,
prompt=request.prompt, prompt=request.prompt,
@ -533,9 +514,8 @@ async def benchmark(
expected.append(request.completion) expected.append(request.completion)
tasks.append( tasks.append(
asyncio.create_task( asyncio.create_task(
limited_request_func(request_func_input=request_func_input, pbar=pbar) limited_request_func(request_func_input=request_func_input,
) pbar=pbar)))
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile: if profile:
@ -567,58 +547,54 @@ async def benchmark(
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
) )
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) print("{:<40} {:<10}".format("Total generated tokens:",
print( metrics.total_output))
"{:<40} {:<10.2f}".format( print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
"Request throughput (req/s):", metrics.request_throughput metrics.request_throughput))
)
)
if goodput_config_dict: if goodput_config_dict:
print( print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
"{:<40} {:<10.2f}".format( metrics.request_goodput))
"Request goodput (req/s):", metrics.request_goodput print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
) metrics.output_throughput))
) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
print( metrics.total_token_throughput))
"{:<40} {:<10.2f}".format(
"Output token throughput (tok/s):", metrics.output_throughput
)
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
)
)
result = { result = {
"duration": benchmark_duration, "duration":
"completed": metrics.completed, benchmark_duration,
"total_input_tokens": metrics.total_input, "completed":
"total_output_tokens": metrics.total_output, metrics.completed,
"request_throughput": metrics.request_throughput, "total_input_tokens":
"output_throughput": metrics.output_throughput, metrics.total_input,
"total_token_throughput": metrics.total_token_throughput, "total_output_tokens":
"ttft_description": pd.Series([output.ttft for output in outputs]) metrics.total_output,
.describe() "request_throughput":
.to_dict(), metrics.request_throughput,
"tpot_description": pd.Series([output.tpot for output in outputs]) "output_throughput":
.describe() metrics.output_throughput,
.to_dict(), "total_token_throughput":
metrics.total_token_throughput,
"ttft_description":
pd.Series([output.ttft for output in outputs]).describe().to_dict(),
"tpot_description":
pd.Series([output.tpot for output in outputs]).describe().to_dict(),
"input_lens": [output.prompt_len for output in outputs], "input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens, "output_lens":
actual_output_lens,
"ttfts": [output.ttft for output in outputs], "ttfts": [output.ttft for output in outputs],
"itls": [output.itl for output in outputs], "itls": [output.itl for output in outputs],
"errors": [output.error for output in outputs], "errors": [output.error for output in outputs],
} }
ret = [ ret = [{
{"generated": output.generated_text, "expected": gt} 'generated': output.generated_text,
for output, gt in zip(outputs, expected) 'expected': gt
] } for output, gt in zip(outputs, expected)]
def process_one_metric( def process_one_metric(
# E.g., "ttft" # E.g., "ttft"
@ -632,35 +608,29 @@ async def benchmark(
# metric. # metric.
if metric_attribute_name not in selected_percentile_metrics: if metric_attribute_name not in selected_percentile_metrics:
return return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print( print("{:<40} {:<10.2f}".format(
"{:<40} {:<10.2f}".format( f"Mean {metric_name} (ms):",
f"Mean {metric_name} (ms):", getattr(metrics, f"mean_{metric_attribute_name}_ms")))
getattr(metrics, f"mean_{metric_attribute_name}_ms"), print("{:<40} {:<10.2f}".format(
) f"Median {metric_name} (ms):",
) getattr(metrics, f"median_{metric_attribute_name}_ms")))
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms"),
)
)
result[f"mean_{metric_attribute_name}_ms"] = getattr( result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms" metrics, f"mean_{metric_attribute_name}_ms")
)
result[f"median_{metric_attribute_name}_ms"] = getattr( result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms" metrics, f"median_{metric_attribute_name}_ms")
)
result[f"std_{metric_attribute_name}_ms"] = getattr( result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms" metrics, f"std_{metric_attribute_name}_ms")
) for p, value in getattr(metrics,
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p) p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value result[f"p{p_word}_{metric_attribute_name}_ms"] = value
process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency")
@ -670,13 +640,13 @@ async def benchmark(
def evaluate(ret, args): def evaluate(ret, args):
def _eval_correctness_json(expected, actual): def _eval_correctness_json(expected, actual):
# extract json string from string using regex # extract json string from string using regex
import re import re
actual = actual.replace('\n', '').replace(' ', '').strip()
actual = actual.replace("\n", "").replace(" ", "").strip()
try: try:
actual = re.search(r"\{.*\}", actual).group() actual = re.search(r'\{.*\}', actual).group()
actual = json.loads(actual) actual = json.loads(actual)
except Exception: except Exception:
return False return False
@ -688,32 +658,28 @@ def evaluate(ret, args):
def _eval_correctness_regex(expected, actual): def _eval_correctness_regex(expected, actual):
import re import re
return re.match(args.regex, actual) is not None return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual): def _eval_correctness(expected, actual):
if args.structure_type == "guided_json": if args.structure_type == 'guided_json':
return _eval_correctness_json(expected, actual) return _eval_correctness_json(expected, actual)
elif args.structure_type == "guided_regex": elif args.structure_type == 'guided_regex':
return _eval_correctness_regex(expected, actual) return _eval_correctness_regex(expected, actual)
elif args.structure_type == "guided_choice": elif args.structure_type == 'guided_choice':
return _eval_correctness_choice(expected, actual) return _eval_correctness_choice(expected, actual)
else: else:
return None return None
scores = [] scores = []
for res in ret: for res in ret:
score = _eval_correctness(res["expected"], res["generated"]) score = _eval_correctness(res['expected'], res['generated'])
res["correctness"] = score res['correctness'] = score
scores.append(score) scores.append(score)
not_none_scores = [score for score in scores if score is not None] not_none_scores = [score for score in scores if score is not None]
return ( return (sum(not_none_scores) / len(not_none_scores) *
(sum(not_none_scores) / len(not_none_scores) * 100) 100) if len(not_none_scores) > 0 else None
if len(not_none_scores) > 0
else None
)
def parse_goodput(slo_pairs): def parse_goodput(slo_pairs):
@ -725,10 +691,9 @@ def parse_goodput(slo_pairs):
except ValueError as err: except ValueError as err:
raise argparse.ArgumentTypeError( raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. " "Invalid format found for service level objectives. "
'Specify service level objectives for goodput as "KEY:VALUE" ' "Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is a " "pairs, where the key is a metric name, and the value is a "
"number in milliseconds." "number in milliseconds.") from err
) from err
return goodput_config_dict return goodput_config_dict
@ -742,14 +707,12 @@ def check_goodput_args(args):
raise ValueError( raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. " f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of " "The service level objective name should be one of "
f"{str(VALID_NAMES)}. " f"{str(VALID_NAMES)}. ")
)
if slo_val < 0: if slo_val < 0:
raise ValueError( raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. " f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be " "The service level objective value should be "
"non-negative." "non-negative.")
)
return goodput_config_dict return goodput_config_dict
@ -775,19 +738,19 @@ def main(args: argparse.Namespace):
tokenizer_mode=args.tokenizer_mode, tokenizer_mode=args.tokenizer_mode,
) )
if args.dataset == "grammar": if args.dataset == 'grammar':
args.structure_type = "guided_grammar" args.structure_type = 'guided_grammar'
elif args.dataset == "regex": elif args.dataset == 'regex':
args.structure_type = "guided_regex" args.structure_type = 'guided_regex'
elif args.dataset == "choice": elif args.dataset == 'choice':
args.structure_type = "guided_choice" args.structure_type = 'guided_choice'
else: else:
args.structure_type = "guided_json" args.structure_type = 'guided_json'
if args.no_structured_output: if args.no_structured_output:
args.structured_output_ratio = 0 args.structured_output_ratio = 0
if args.save_results: if args.save_results:
result_file_name = f"{args.structured_output_ratio}guided" result_file_name = f'{args.structured_output_ratio}guided'
result_file_name += f"_{backend}" result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}" result_file_name += f"_{args.model.split('/')[-1]}"
@ -815,29 +778,37 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm, disable_tqdm=args.disable_tqdm,
profile=args.profile, profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","), selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos, ignore_eos=args.ignore_eos,
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
structured_output_ratio=args.structured_output_ratio, structured_output_ratio=args.structured_output_ratio,
structured_output_backend=args.structured_output_backend,
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
) ))
)
# Save config and results to json # Save config and results to json
score = evaluate(ret, args) score = evaluate(ret, args)
print("correct_rate(%)", score, "\n") print("correct_rate(%)", score, '\n')
if args.save_results: if args.save_results:
results = { results = {
"backend": backend, "backend":
"model_id": model_id, backend,
"tokenizer_id": tokenizer_id, "model_id":
"num_prompts": args.num_prompts, model_id,
"request_rate": args.request_rate "tokenizer_id":
if args.request_rate < float("inf") tokenizer_id,
else "inf", "num_prompts":
"burstiness": args.burstiness, args.num_prompts,
"max_concurrency": args.max_concurrency, "request_rate":
"correct_rate(%)": score, args.request_rate if args.request_rate < float("inf") else "inf",
"burstiness":
args.burstiness,
"max_concurrency":
args.max_concurrency,
"correct_rate(%)":
score
} }
results = {"outputs": ret, **results, **benchmark_result} results = {"outputs": ret, **results, **benchmark_result}
@ -846,14 +817,13 @@ def main(args: argparse.Namespace):
result_file_name = args.result_filename result_file_name = args.result_filename
if args.result_dir: if args.result_dir:
result_file_name = os.path.join(args.result_dir, result_file_name) result_file_name = os.path.join(args.result_dir, result_file_name)
with open(result_file_name, "w", encoding="utf-8") as outfile: with open(result_file_name, "w", encoding='utf-8') as outfile:
json.dump(results, outfile, indent=4) json.dump(results, outfile, indent=4)
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput." description="Benchmark the online serving throughput.")
)
parser.add_argument( parser.add_argument(
"--backend", "--backend",
type=str, type=str,
@ -875,14 +845,16 @@ if __name__ == "__main__":
default="/v1/completions", default="/v1/completions",
help="API endpoint.", help="API endpoint.",
) )
parser.add_argument( parser.add_argument("--dataset",
"--dataset", default='json',
default="json", choices=[
choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"], 'json', 'json-unique', 'grammar', 'regex',
) 'choice', 'xgrammar_bench'
parser.add_argument( ])
"--json-schema-path", type=str, default=None, help="Path to json schema." parser.add_argument("--json_schema_path",
) type=str,
default=None,
help="Path to json schema.")
parser.add_argument( parser.add_argument(
"--max-concurrency", "--max-concurrency",
type=int, type=int,
@ -894,8 +866,7 @@ if __name__ == "__main__":
"initiated, this argument will control how many are actually allowed " "initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the " "to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, " "actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.", "if the server is not processing requests fast enough to keep up.")
)
parser.add_argument( parser.add_argument(
"--model", "--model",
type=str, type=str,
@ -905,13 +876,15 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--tokenizer", "--tokenizer",
type=str, type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--tokenizer-mode", "--tokenizer-mode",
type=str, type=str,
default="auto", default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--num-prompts", "--num-prompts",
@ -988,51 +961,50 @@ if __name__ == "__main__":
"--ignore-eos", "--ignore-eos",
action="store_true", action="store_true",
help="Set ignore_eos flag when sending the benchmark request." help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.", "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
)
parser.add_argument( parser.add_argument(
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. " help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
'Default value is "ttft,tpot,itl".', "Default value is \"ttft,tpot,itl\".")
)
parser.add_argument( parser.add_argument(
"--metric-percentiles", "--metric-percentiles",
type=str, type=str,
default="99", default="99",
help="Comma-separated list of percentiles for selected metrics. " help="Comma-seperated list of percentiles for selected metrics. "
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
'Default value is "99". ' "Default value is \"99\". "
'Use "--percentile-metrics" to select metrics.', "Use \"--percentile-metrics\" to select metrics.",
) )
parser.add_argument( parser.add_argument(
"--goodput", "--goodput",
nargs="+", nargs="+",
required=False, required=False,
help='Specify service level objectives for goodput as "KEY:VALUE" ' help="Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is in " "pairs, where the key is a metric name, and the value is in "
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
"separated by spaces. Allowed request level metric names are " "separated by spaces. Allowed request level metric names are "
'"ttft", "tpot", "e2el". For more context on the definition of ' "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve", "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
)
parser.add_argument("--no-structured-output",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--structured-output-ratio",
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
parser.add_argument( parser.add_argument(
"--no-structured-output", "--structured-output-backend",
action="store_true", type=str,
default=False, choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
help="Whether to disable JSON decoding or not.", default="xgrammar",
) help="Backend to use for structured outputs")
parser.add_argument(
"--structured-output-ratio",
type=float,
default=1.0,
help="Ratio of Structured Outputs requests",
)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput.""" """Benchmark offline inference throughput."""
import argparse import argparse
import dataclasses import dataclasses
import json import json
@ -12,25 +11,17 @@ from typing import Any, Optional, Union
import torch import torch
import uvloop import uvloop
from tqdm import tqdm from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_dataset import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import ( from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args, build_async_engine_client_from_engine_args)
)
from vllm.inputs import TextPrompt, TokensPrompt from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
@ -45,30 +36,23 @@ def run_vllm(
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]: ) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert all( assert all(
llm.llm_engine.model_config.max_model_len llm.llm_engine.model_config.max_model_len >= (
>= (request.prompt_len + request.expected_output_len) request.prompt_len + request.expected_output_len)
for request in requests for request in requests), (
), ( "Please ensure that max_model_len is greater than the sum of"
"Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.")
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine. # Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = [] prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = [] sampling_params: list[SamplingParams] = []
for request in requests: for request in requests:
prompts.append( prompts.append(
TokensPrompt( TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
prompt_token_ids=request.prompt["prompt_token_ids"], multi_modal_data=request.multi_modal_data)
multi_modal_data=request.multi_modal_data, if "prompt_token_ids" in request.prompt else \
) TextPrompt(prompt=request.prompt,
if "prompt_token_ids" in request.prompt multi_modal_data=request.multi_modal_data))
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append( sampling_params.append(
SamplingParams( SamplingParams(
n=n, n=n,
@ -77,8 +61,7 @@ def run_vllm(
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=request.expected_output_len,
detokenize=not disable_detokenize, detokenize=not disable_detokenize,
) ))
)
lora_requests: Optional[list[LoRARequest]] = None lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora: if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests] lora_requests = [request.lora_request for request in requests]
@ -88,9 +71,10 @@ def run_vllm(
outputs = None outputs = None
if not use_beam_search: if not use_beam_search:
start = time.perf_counter() start = time.perf_counter()
outputs = llm.generate( outputs = llm.generate(prompts,
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True sampling_params,
) lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
else: else:
assert lora_requests is None, "BeamSearch API does not support LoRA" assert lora_requests is None, "BeamSearch API does not support LoRA"
@ -106,35 +90,30 @@ def run_vllm(
beam_width=n, beam_width=n,
max_tokens=output_len, max_tokens=output_len,
ignore_eos=True, ignore_eos=True,
), ))
)
end = time.perf_counter() end = time.perf_counter()
return end - start, outputs return end - start, outputs
def run_vllm_chat( def run_vllm_chat(
requests: list[SampleRequest], requests: list[SampleRequest],
n: int, n: int,
engine_args: EngineArgs, engine_args: EngineArgs,
disable_detokenize: bool = False, disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
) -> tuple[float, list[RequestOutput]]:
""" """
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead. formatting. For non-multimodal models, use run_vllm() instead.
""" """
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert all( assert all(
llm.llm_engine.model_config.max_model_len llm.llm_engine.model_config.max_model_len >= (
>= (request.prompt_len + request.expected_output_len) request.prompt_len + request.expected_output_len)
for request in requests for request in requests), (
), ( "Please ensure that max_model_len is greater than the sum of "
"Please ensure that max_model_len is greater than the sum of " "prompt_len and expected_output_len for all requests.")
"prompt_len and expected_output_len for all requests."
)
prompts = [] prompts = []
sampling_params: list[SamplingParams] = [] sampling_params: list[SamplingParams] = []
@ -148,8 +127,7 @@ def run_vllm_chat(
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=request.expected_output_len,
detokenize=not disable_detokenize, detokenize=not disable_detokenize,
) ))
)
start = time.perf_counter() start = time.perf_counter()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True) outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
@ -166,17 +144,13 @@ async def run_vllm_async(
from vllm import SamplingParams from vllm import SamplingParams
async with build_async_engine_client_from_engine_args( async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing engine_args, disable_frontend_multiprocessing) as llm:
) as llm:
model_config = await llm.get_model_config()
assert all( assert all(
model_config.max_model_len llm.model_config.max_model_len >= (request.prompt_len +
>= (request.prompt_len + request.expected_output_len) request.expected_output_len)
for request in requests for request in requests), (
), ( "Please ensure that max_model_len is greater than the sum of"
"Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.")
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine. # Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = [] prompts: list[Union[TextPrompt, TokensPrompt]] = []
@ -184,15 +158,11 @@ async def run_vllm_async(
lora_requests: list[Optional[LoRARequest]] = [] lora_requests: list[Optional[LoRARequest]] = []
for request in requests: for request in requests:
prompts.append( prompts.append(
TokensPrompt( TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
prompt_token_ids=request.prompt["prompt_token_ids"], multi_modal_data=request.multi_modal_data)
multi_modal_data=request.multi_modal_data, if "prompt_token_ids" in request.prompt else \
) TextPrompt(prompt=request.prompt,
if "prompt_token_ids" in request.prompt multi_modal_data=request.multi_modal_data))
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append( sampling_params.append(
SamplingParams( SamplingParams(
n=n, n=n,
@ -201,16 +171,17 @@ async def run_vllm_async(
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=request.expected_output_len,
detokenize=not disable_detokenize, detokenize=not disable_detokenize,
) ))
)
lora_requests.append(request.lora_request) lora_requests.append(request.lora_request)
generators = [] generators = []
start = time.perf_counter() start = time.perf_counter()
for i, (prompt, sp, lr) in enumerate( for i, (prompt, sp,
zip(prompts, sampling_params, lora_requests) lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
): generator = llm.generate(prompt,
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") sp,
lora_request=lr,
request_id=f"test{i}")
generators.append(generator) generators.append(generator)
all_gens = merge_async_iterators(*generators) all_gens = merge_async_iterators(*generators)
async for i, res in all_gens: async for i, res in all_gens:
@ -229,8 +200,7 @@ def run_hf(
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> float: ) -> float:
llm = AutoModelForCausalLM.from_pretrained( llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
)
if llm.config.model_type == "llama": if llm.config.model_type == "llama":
# To enable padding in the HF backend. # To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
@ -242,26 +212,22 @@ def run_hf(
max_prompt_len = 0 max_prompt_len = 0
max_output_len = 0 max_output_len = 0
for i in range(len(requests)): for i in range(len(requests)):
prompt = requests[i].prompt prompt, prompt_len, output_len = requests[i]
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
# Add the prompt to the batch. # Add the prompt to the batch.
batch.append(prompt) batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len) max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len) max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1: if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch. # Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len _, next_prompt_len, next_output_len = requests[i + 1]
next_output_len = requests[i + 1].expected_output_len if (max(max_prompt_len, next_prompt_len) +
if ( max(max_output_len, next_output_len)) <= 2048:
max(max_prompt_len, next_prompt_len)
+ max(max_output_len, next_output_len)
) <= 2048:
# We can add more requests to the batch. # We can add more requests to the batch.
continue continue
# Generate the sequences. # Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids input_ids = tokenizer(batch, return_tensors="pt",
padding=True).input_ids
llm_outputs = llm.generate( llm_outputs = llm.generate(
input_ids=input_ids.cuda(), input_ids=input_ids.cuda(),
do_sample=True, do_sample=True,
@ -291,7 +257,6 @@ def run_mii(
output_len: int, output_len: int,
) -> float: ) -> float:
from mii import client, serve from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size) llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [request.prompt for request in requests] prompts = [request.prompt for request in requests]
@ -303,9 +268,8 @@ def run_mii(
return end - start return end - start
def save_to_pytorch_benchmark_format( def save_to_pytorch_benchmark_format(args: argparse.Namespace,
args: argparse.Namespace, results: dict[str, Any] results: dict[str, Any]) -> None:
) -> None:
pt_records = convert_to_pytorch_benchmark_format( pt_records = convert_to_pytorch_benchmark_format(
args=args, args=args,
metrics={ metrics={
@ -313,9 +277,9 @@ def save_to_pytorch_benchmark_format(
"tokens_per_second": [results["tokens_per_second"]], "tokens_per_second": [results["tokens_per_second"]],
}, },
extra_info={ extra_info={
k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] k: results[k]
}, for k in ["elapsed_time", "num_requests", "total_num_tokens"]
) })
if pt_records: if pt_records:
# Don't use json suffix here as we don't want CI to pick it up # Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@ -336,7 +300,6 @@ def get_requests(args, tokenizer):
"input_len": args.input_len, "input_len": args.input_len,
"output_len": args.output_len, "output_len": args.output_len,
} }
if args.dataset_path is None or args.dataset_name == "random": if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["prefix_len"] = args.prefix_len
@ -347,31 +310,25 @@ def get_requests(args, tokenizer):
sample_kwargs["enable_multimodal_chat"] = True sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, ( assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset." "Tokenizer/model must have chat template for sonnet dataset.")
)
dataset_cls = SonnetDataset dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: if args.backend != "vllm-chat":
dataset_cls = VisionArenaDataset raise ValueError(
common_kwargs["dataset_subset"] = None "hf datasets only are supported by vllm-chat backend")
common_kwargs["dataset_split"] = "train" # Choose between VisionArenaDataset and HuggingFaceDataset based on
sample_kwargs["enable_multimodal_chat"] = True # provided parameters.
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: dataset_cls = (VisionArenaDataset if args.dataset_path
dataset_cls = InstructCoderDataset == VisionArenaDataset.VISION_ARENA_DATASET_PATH
common_kwargs["dataset_split"] = "train" and args.hf_subset is None else HuggingFaceDataset)
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: common_kwargs['dataset_subset'] = args.hf_subset
dataset_cls = ConversationDataset common_kwargs['dataset_split'] = args.hf_split
common_kwargs["dataset_subset"] = args.hf_subset sample_kwargs["enable_multimodal_chat"] = True
common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
else: else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}") raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values # Remove None values
@ -386,10 +343,10 @@ def main(args: argparse.Namespace):
random.seed(args.seed) random.seed(args.seed)
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code args.tokenizer, trust_remote_code=args.trust_remote_code)
)
requests = get_requests(args, tokenizer) requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None for request in requests) is_multi_modal = any(request.multi_modal_data is not None
for request in requests)
request_outputs: Optional[list[RequestOutput]] = None request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm": if args.backend == "vllm":
if args.async_engine: if args.async_engine:
@ -400,34 +357,23 @@ def main(args: argparse.Namespace):
AsyncEngineArgs.from_cli_args(args), AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
args.disable_detokenize, args.disable_detokenize,
) ))
)
else: else:
elapsed_time, request_outputs = run_vllm( elapsed_time, request_outputs = run_vllm(
requests, requests, args.n, EngineArgs.from_cli_args(args),
args.n, args.disable_detokenize)
EngineArgs.from_cli_args(args),
args.disable_detokenize,
)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf( elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
requests, args.hf_max_batch_size, args.trust_remote_code,
args.model, args.disable_detokenize)
tokenizer,
args.n,
args.hf_max_batch_size,
args.trust_remote_code,
args.disable_detokenize,
)
elif args.backend == "mii": elif args.backend == "mii":
elapsed_time = run_mii( elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
requests, args.model, args.tensor_parallel_size, args.output_len args.output_len)
)
elif args.backend == "vllm-chat": elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat( elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize requests, args.n, EngineArgs.from_cli_args(args),
) args.disable_detokenize)
else: else:
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
@ -439,31 +385,28 @@ def main(args: argparse.Namespace):
for ro in request_outputs: for ro in request_outputs:
if not isinstance(ro, RequestOutput): if not isinstance(ro, RequestOutput):
continue continue
total_prompt_tokens += ( total_prompt_tokens += len(
len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 ro.prompt_token_ids) if ro.prompt_token_ids else 0
) total_output_tokens += sum(
total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens total_num_tokens = total_prompt_tokens + total_output_tokens
else: else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) total_num_tokens = sum(r.prompt_len + r.expected_output_len
for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests) total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat": if is_multi_modal and args.backend != "vllm-chat":
print( print("\033[91mWARNING\033[0m: Multi-modal request with "
"\033[91mWARNING\033[0m: Multi-modal request with " f"{args.backend} backend detected. The "
f"{args.backend} backend detected. The " "following metrics are not accurate because image tokens are not"
"following metrics are not accurate because image tokens are not" " counted. See vllm-project/vllm/issues/9778 for details.")
" counted. See vllm-project/vllm/issues/9778 for details."
)
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now # vllm-chat backend counts the image tokens now
print( print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
print(f"Total num prompt tokens: {total_prompt_tokens}") print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}") print(f"Total num output tokens: {total_output_tokens}")
@ -491,8 +434,7 @@ def validate_args(args):
warnings.warn( warnings.warn(
"The '--dataset' argument will be deprecated in the next release. " "The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.", "Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2, stacklevel=2)
)
args.dataset_path = args.dataset args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None): if not getattr(args, "tokenizer", None):
@ -505,8 +447,9 @@ def validate_args(args):
# === Dataset Configuration === # === Dataset Configuration ===
if not args.dataset and not args.dataset_path: if not args.dataset and not args.dataset_path:
print("When dataset path is not set, it will default to random dataset") print(
args.dataset_name = "random" "When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
if args.input_len is None: if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset") raise ValueError("input_len must be provided for a random dataset")
@ -514,55 +457,33 @@ def validate_args(args):
# --hf-subset and --hf-split: only used # --hf-subset and --hf-split: only used
# when dataset_name is 'hf' # when dataset_name is 'hf'
if args.dataset_name != "hf" and ( if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None or getattr(args, "hf_split", None) is not None):
): warnings.warn("--hf-subset and --hf-split will be ignored \
warnings.warn(
"--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.", since --dataset-name is not 'hf'.",
stacklevel=2, stacklevel=2)
) elif args.dataset_name == "hf" and args.backend != "vllm-chat":
elif args.dataset_name == "hf": raise ValueError(
if args.dataset_path in ( "When --dataset-name is 'hf', backend must be 'vllm-chat'")
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm-chat", (
f"{args.dataset_path} needs to use vllm-chat as the backend."
) # noqa: E501
elif args.dataset_path in (
InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm", (
f"{args.dataset_path} needs to use vllm as the backend."
) # noqa: E501
else:
raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random' # --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != "random" and args.random_range_ratio is not None: if args.dataset_name != 'random' and args.random_range_ratio is not None:
warnings.warn( warnings.warn("--random-range-ratio will be ignored since \
"--random-range-ratio will be ignored since \
--dataset-name is not 'random'.", --dataset-name is not 'random'.",
stacklevel=2, stacklevel=2)
)
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set. # set.
if ( if args.dataset_name not in {"random", "sonnet", None
args.dataset_name not in {"random", "sonnet", None} } and args.prefix_len is not None:
and args.prefix_len is not None warnings.warn("--prefix-len will be ignored since --dataset-name\
):
warnings.warn(
"--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.", is not 'random', 'sonnet', or not set.",
stacklevel=2, stacklevel=2)
)
# === LoRA Settings === # === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm": if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError("LoRA benchmarking is only supported for vLLM backend") raise ValueError(
"LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None: if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True") raise ValueError("LoRA path must be provided when enable_lora is True")
@ -572,10 +493,8 @@ def validate_args(args):
if args.backend != "hf" and args.hf_max_batch_size is not None: if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.") raise ValueError("HF max batch size is only for HF backend.")
if ( if args.backend in {"hf", "mii"} and getattr(args, "quantization",
args.backend in {"hf", "mii"} None) is not None:
and getattr(args, "quantization", None) is not None
):
raise ValueError("Quantization is only for vLLM backend.") raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto": if args.backend == "mii" and args.dtype != "auto":
@ -583,32 +502,22 @@ def validate_args(args):
if args.backend == "mii" and args.n != 1: if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.") raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model: if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII backend.")
# --data-parallel is not supported currently.
# https://github.com/vllm-project/vllm/issues/16222
if args.data_parallel_size > 1:
raise ValueError( raise ValueError(
"Data parallel is not supported in offline benchmark, \ "Tokenizer must be the same as the model for MII backend.")
please use benchmark serving instead"
)
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument( parser.add_argument("--backend",
"--backend", type=str,
type=str, choices=["vllm", "hf", "mii", "vllm-chat"],
choices=["vllm", "hf", "mii", "vllm-chat"], default="vllm")
default="vllm",
)
parser.add_argument( parser.add_argument(
"--dataset-name", "--dataset-name",
type=str, type=str,
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
default="sharegpt", default="sharegpt")
)
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
type=str, type=str,
@ -616,104 +525,80 @@ if __name__ == "__main__":
help="Path to the ShareGPT dataset, will be deprecated in\ help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to " the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: " "be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]", "list[dict[..., value: <prompt_or_response>]]]]")
) parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
parser.add_argument( parser.add_argument(
"--dataset-path", type=str, default=None, help="Path to the dataset" '--output-json',
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str, type=str,
default=None, default=None,
help="Path to save the throughput results in JSON format.", help='Path to save the throughput results in JSON format.')
) parser.add_argument("--async-engine",
parser.add_argument( action='store_true',
"--async-engine", default=False,
action="store_true", help="Use vLLM async engine rather than LLM class.")
default=False, parser.add_argument("--disable-frontend-multiprocessing",
help="Use vLLM async engine rather than LLM class.", action='store_true',
) default=False,
parser.add_argument( help="Disable decoupled async engine frontend.")
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument( parser.add_argument(
"--disable-detokenize", "--disable-detokenize",
action="store_true", action="store_true",
help=( help=("Do not detokenize the response (i.e. do not include "
"Do not detokenize the response (i.e. do not include " "detokenization time in the measurement)"))
"detokenization time in the measurement)"
),
)
# LoRA # LoRA
parser.add_argument( parser.add_argument(
"--lora-path", "--lora-path",
type=str, type=str,
default=None, default=None,
help="Path to the LoRA adapters to use. This can be an absolute path, " help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.", "a relative path, or a Hugging Face model identifier.")
) parser.add_argument("--prefix-len",
parser.add_argument( type=int,
"--prefix-len", default=None,
type=int, help="Number of prefix tokens per request."
default=None, "This is for the RandomDataset and SonnetDataset")
help=f"Number of prefix tokens to be used in RandomDataset "
"and SonnetDataset. For RandomDataset, the total input "
"length is the sum of prefix-len (default: "
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
"sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]. For SonnetDataset, "
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
"controls how much of the input is fixed lines versus "
"random lines, but the total input length remains approximately "
"input_len tokens.",
)
# random dataset # random dataset
parser.add_argument( parser.add_argument(
"--random-range-ratio", "--random-range-ratio",
type=float, type=float,
default=None, default=None,
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " help="Range of sampled ratio of input/output length, "
"for sampling input/output length, " "used only for RandomDataSet.",
"used only for RandomDataset. Must be in the range [0, 1) to "
"define a symmetric sampling range "
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
) )
# hf dtaset # hf dtaset
parser.add_argument( parser.add_argument("--hf-subset",
"--hf-subset", type=str, default=None, help="Subset of the HF dataset." type=str,
) default=None,
parser.add_argument( help="Subset of the HF dataset.")
"--hf-split", type=str, default=None, help="Split of the HF dataset." parser.add_argument("--hf-split",
) type=str,
default=None,
help="Split of the HF dataset.")
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()

View File

@ -7,9 +7,9 @@ import os
from typing import Any from typing import Any
def convert_to_pytorch_benchmark_format( def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] metrics: dict[str, list],
) -> list: extra_info: dict[str, Any]) -> list:
""" """
Save the benchmark results in the format used by PyTorch OSS benchmark with Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record on metric per record
@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(
}, },
} }
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata # Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info: if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( record["benchmark"]["extra_info"]["args"][
extra_info["tensor_parallel_size"] "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
)
records.append(record) records.append(record)
@ -50,6 +50,7 @@ def convert_to_pytorch_benchmark_format(
class InfEncoder(json.JSONEncoder): class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any): def clear_inf(self, o: Any):
if isinstance(o, dict): if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()} return {k: self.clear_inf(v) for k, v in o.items()}

View File

@ -23,9 +23,8 @@ DEFAULT_TP_SIZES = [1]
# bench # bench
def bench_fn( def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs **kwargs) -> TMeasurement:
) -> TMeasurement:
min_run_time = 1 min_run_time = 1
globals = { globals = {
@ -42,18 +41,16 @@ def bench_fn(
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
def bench_int8( def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str sub_label: str) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
assert dtype == torch.int8 assert dtype == torch.int8
b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
out = ops.cutlass_scaled_sparse_mm( out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
a, b_compressed, e, scale_a, scale_b, torch.bfloat16 torch.bfloat16)
)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
if not torch.allclose(out, out_ref): if not torch.allclose(out, out_ref):
@ -66,107 +63,54 @@ def bench_int8(
timers = [] timers = []
# pytorch impl - bfloat16 # pytorch impl - bfloat16
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
label, torch.mm, a.to(dtype=torch.bfloat16),
sub_label, b.to(dtype=torch.bfloat16)))
"pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm,
a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16),
)
)
# pytorch impl - float16 # pytorch impl - float16
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label,
label, "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
sub_label, a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
"pytorch_fp16_fp16_fp16_matmul-no-scales",
torch.mm,
a.to(dtype=torch.float16),
b.to(dtype=torch.float16),
)
)
# cutlass impl # cutlass impl
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
label, ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
sub_label, torch.bfloat16))
"cutlass_i8_i8_bf16_scaled_mm",
ops.cutlass_scaled_mm,
a,
b,
scale_a,
scale_b,
torch.bfloat16,
)
)
# cutlass with bias # cutlass with bias
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
label, ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
sub_label, bias))
"cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm,
a,
b,
scale_a,
scale_b,
torch.bfloat16,
bias,
)
)
# cutlass sparse impl # cutlass sparse impl
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
sub_label, scale_b, torch.bfloat16))
"cutlass_i8_i8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.bfloat16,
)
)
# cutlass sparse with bias # cutlass sparse with bias
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
sub_label, scale_b, torch.bfloat16, bias))
"cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.bfloat16,
bias,
)
)
return timers return timers
def bench_fp8( def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str sub_label: str) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn assert dtype == torch.float8_e4m3fn
b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
out = ops.cutlass_scaled_sparse_mm( out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
a, b_compressed, e, scale_a, scale_b, torch.bfloat16 torch.bfloat16)
)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
if not torch.allclose(out, out_ref): if not torch.allclose(out, out_ref):
@ -180,165 +124,97 @@ def bench_fp8(
# pytorch impl w. bf16 # pytorch impl w. bf16
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
label, torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
sub_label, b.to(dtype=torch.bfloat16, device="cuda")))
"pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm,
a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"),
)
)
# pytorch impl: bf16 output, without fp8 fast accum # pytorch impl: bf16 output, without fp8 fast accum
timers.append( timers.append(
bench_fn( bench_fn(label,
label, sub_label,
sub_label, "pytorch_fp8_fp8_bf16_scaled_mm",
"pytorch_fp8_fp8_bf16_scaled_mm", torch._scaled_mm,
torch._scaled_mm, a,
a, b,
b, scale_a=scale_a,
scale_a=scale_a, scale_b=scale_b,
scale_b=scale_b, out_dtype=torch.bfloat16))
out_dtype=torch.bfloat16,
)
)
# pytorch impl: bf16 output, with fp8 fast accum # pytorch impl: bf16 output, with fp8 fast accum
timers.append( timers.append(
bench_fn( bench_fn(label,
label, sub_label,
sub_label, "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", torch._scaled_mm,
torch._scaled_mm, a,
a, b,
b, scale_a=scale_a,
scale_a=scale_a, scale_b=scale_b,
scale_b=scale_b, out_dtype=torch.bfloat16,
out_dtype=torch.bfloat16, use_fast_accum=True))
use_fast_accum=True,
)
)
# pytorch impl: fp16 output, without fp8 fast accum # pytorch impl: fp16 output, without fp8 fast accum
timers.append( timers.append(
bench_fn( bench_fn(label,
label, sub_label,
sub_label, "pytorch_fp8_fp8_fp16_scaled_mm",
"pytorch_fp8_fp8_fp16_scaled_mm", torch._scaled_mm,
torch._scaled_mm, a,
a, b,
b, scale_a=scale_a,
scale_a=scale_a, scale_b=scale_b,
scale_b=scale_b, out_dtype=torch.float16))
out_dtype=torch.float16,
)
)
# pytorch impl: fp16 output, with fp8 fast accum # pytorch impl: fp16 output, with fp8 fast accum
timers.append( timers.append(
bench_fn( bench_fn(label,
label, sub_label,
sub_label, "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", torch._scaled_mm,
torch._scaled_mm, a,
a, b,
b, scale_a=scale_a,
scale_a=scale_a, scale_b=scale_b,
scale_b=scale_b, out_dtype=torch.float16,
out_dtype=torch.float16, use_fast_accum=True))
use_fast_accum=True,
)
)
# cutlass impl: bf16 output # cutlass impl: bf16 output
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
label, ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
sub_label, torch.bfloat16))
"cutlass_fp8_fp8_bf16_scaled_mm",
ops.cutlass_scaled_mm,
a,
b,
scale_a,
scale_b,
torch.bfloat16,
)
)
# cutlass impl: bf16 output # cutlass impl: bf16 output
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
sub_label, scale_b, torch.bfloat16))
"cutlass_fp8_fp8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.bfloat16,
)
)
# cutlass impl: fp16 output # cutlass impl: fp16 output
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
sub_label, scale_b, torch.float16))
"cutlass_fp8_fp8_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.float16,
)
)
# cutlass impl: bf16 output, with bias # cutlass impl: bf16 output, with bias
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label,
label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
sub_label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
"cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", scale_b, torch.bfloat16, bias))
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.bfloat16,
bias,
)
)
# cutlass impl: fp16 output, with bias # cutlass impl: fp16 output, with bias
timers.append( timers.append(
bench_fn( bench_fn(label, sub_label,
label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
sub_label, ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
"cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", scale_b, torch.float16, bias.to(dtype=torch.float16)))
ops.cutlass_scaled_sparse_mm,
a,
b_compressed,
e,
scale_a,
scale_b,
torch.float16,
bias.to(dtype=torch.float16),
)
)
return timers return timers
def bench( def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str sub_label: str) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
if dtype == torch.int8: if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label) return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn: if dtype == torch.float8_e4m3fn:
@ -352,12 +228,12 @@ def print_timers(timers: Iterable[TMeasurement]):
compare.print() compare.print()
def run( def run(dtype: torch.dtype,
dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]] MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
results = [] results = []
for m, k, n in MKNs: for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})") timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers) print_timers(timers)
results.extend(timers) results.extend(timers)
@ -365,12 +241,10 @@ def run(
# output makers # output makers
def make_output( def make_output(data: Iterable[TMeasurement],
data: Iterable[TMeasurement], MKNs: Iterable[tuple[int, int, int]],
MKNs: Iterable[tuple[int, int, int]], base_description: str,
base_description: str, timestamp=None):
timestamp=None,
):
print(f"== All Results {base_description} ====") print(f"== All Results {base_description} ====")
print_timers(data) print_timers(data)
@ -384,7 +258,8 @@ def make_output(
def run_square_bench(args): def run_square_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs) data = run(args.dtype, MKNs)
@ -444,7 +319,7 @@ def run_model_bench(args):
pkl.dump(all_data, f) pkl.dump(all_data, f)
if __name__ == "__main__": if __name__ == '__main__':
def to_torch_dtype(dt): def to_torch_dtype(dt):
if dt == "int8": if dt == "int8":
@ -469,15 +344,12 @@ Benchmark Cutlass GEMM.
Output: Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501 """, # noqa: E501
formatter_class=argparse.RawTextHelpFormatter, formatter_class=argparse.RawTextHelpFormatter)
)
parser.add_argument( parser.add_argument("--dtype",
"--dtype", type=to_torch_dtype,
type=to_torch_dtype, required=True,
required=True, help="Available options are ['int8', 'fp8']")
help="Available options are ['int8', 'fp8']",
)
subparsers = parser.add_subparsers(dest="cmd") subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench") square_parser = subparsers.add_parser("square_bench")
@ -496,19 +368,19 @@ Benchmark Cutlass GEMM.
range_parser.set_defaults(func=run_range_bench) range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench") model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument( model_parser.add_argument("--models",
"--models", nargs="+",
nargs="+", type=str,
type=str, default=DEFAULT_MODELS,
default=DEFAULT_MODELS, choices=WEIGHT_SHAPES.keys())
choices=WEIGHT_SHAPES.keys(), model_parser.add_argument("--tp-sizes",
) nargs="+",
model_parser.add_argument( type=int,
"--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES default=DEFAULT_TP_SIZES)
) model_parser.add_argument("--batch-sizes",
model_parser.add_argument( nargs="+",
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES type=int,
) default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench) model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args() args = parser.parse_args()

View File

@ -10,9 +10,8 @@ import vllm._custom_ops as ops
def to_fp8(tensor: torch.Tensor) -> torch.Tensor: def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( return torch.round(tensor.clamp(
dtype=torch.float8_e4m3fn min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
)
def to_int8(tensor: torch.Tensor) -> torch.Tensor: def to_int8(tensor: torch.Tensor) -> torch.Tensor:
@ -27,11 +26,10 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(dtype=torch.float16) return tensor.to(dtype=torch.float16)
def make_rand_tensors( def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
dtype: torch.dtype, m: int, n: int, k: int k: int) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5
a = torch.randn((m, k), device="cuda") * 5 b = torch.randn((n, k), device='cuda').t() * 5
b = torch.randn((n, k), device="cuda").t() * 5
if dtype == torch.int8: if dtype == torch.int8:
return to_int8(a), to_int8(b) return to_int8(a), to_int8(b)
@ -51,7 +49,9 @@ def prune_to_2_4(tensor):
# Create binary mask # Create binary mask
mask = torch.zeros_like(reshaped) mask = torch.zeros_like(reshaped)
mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) mask.scatter_(dim=1,
index=indices,
src=torch.ones_like(indices, dtype=mask.dtype))
# Apply mask and reshape back # Apply mask and reshape back
pruned = reshaped * mask pruned = reshaped * mask
@ -62,11 +62,10 @@ def prune_to_2_4(tensor):
return pruned.reshape(original_shape) return pruned.reshape(original_shape)
def make_rand_sparse_tensors( def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
dtype: torch.dtype, m: int, n: int, k: int k: int) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5
a = torch.randn((m, k), device="cuda") * 5 b = torch.randn((n, k), device='cuda').t() * 5
b = torch.randn((n, k), device="cuda").t() * 5
b = prune_to_2_4(b.t()).t() b = prune_to_2_4(b.t()).t()
@ -87,9 +86,9 @@ def make_rand_sparse_tensors(
return b_compressed, e, a, b return b_compressed, e, a, b
def make_n_rand_sparse_tensors( def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int m: int, n: int, k: int) -> \
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = [] ABs = []
for _ in range(num_tensors): for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)

View File

@ -16,8 +16,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul, w8a8_block_fp8_matmul)
)
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -26,9 +25,8 @@ DEFAULT_TP_SIZES = [1]
# bench # bench
def bench_fn( def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs **kwargs) -> TMeasurement:
) -> TMeasurement:
min_run_time = 1 min_run_time = 1
globals = { globals = {
@ -46,48 +44,45 @@ def bench_fn(
def bench_int8( def bench_int8(
dtype: torch.dtype, dtype: torch.dtype,
m: int, m: int,
k: int, k: int,
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: Optional[list[str]] = None, bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels.""" """Benchmark INT8-based kernels."""
assert dtype == torch.int8 assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k) a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
azp = torch.zeros((m,), device="cuda", dtype=torch.int32) azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32) azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
bench_fns = { bench_fns = {
"pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( "pytorch_bf16_bf16_bf16_matmul-no-scales":
a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
), ),
"pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( "pytorch_fp16_fp16_fp16_matmul-no-scales":
a.to(dtype=torch.float16), b.to(dtype=torch.float16) lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
), "cutlass_i8_i8_bf16_scaled_mm":
"cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
a, b, scale_a, scale_b, torch.bfloat16 "cutlass_i8_i8_bf16_scaled_mm_bias":
), lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
"cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( bias),
a, b, scale_a, scale_b, torch.bfloat16, bias "cutlass_i8_i8_bf16_scaled_mm_azp":
), lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
"cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp( bfloat16, azp_adj),
a, b, scale_a, scale_b, torch.bfloat16, azp_adj "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
), lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
"cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp( bfloat16, azp_adj, None, bias),
a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
), lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
"cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp( bfloat16, azp_adj, azp),
a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
), lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
"cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp( bfloat16, azp_adj, azp, bias),
a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias
),
} }
timers = [] timers = []
@ -101,73 +96,73 @@ def bench_int8(
def bench_fp8( def bench_fp8(
dtype: torch.dtype, dtype: torch.dtype,
m: int, m: int,
k: int, k: int,
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: Optional[list[str]] = None, bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels.""" """Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
a_cont = a.contiguous() a_cont = a.contiguous()
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
block_scale_a = torch.rand((m, k // 128),
def ceil_div(x: int, y: int) -> int: device="cuda",
return (x + y - 1) // y dtype=torch.float32)
block_scale_b = torch.rand((k // 128, n // 128),
block_scale_a = torch.rand( device="cuda",
(m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 dtype=torch.float32)
)
block_scale_b = torch.rand(
ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
)
block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_a_M_major = block_scale_a.t().contiguous().t()
block_scale_b_K_major = block_scale_b.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t()
bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
print(m, k, n) print(m, k, n)
bench_fns = { bench_fns = {
"pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( "pytorch_bf16_bf16_bf16_matmul-no-scales":
a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
), ),
"pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( "pytorch_fp16_fp16_fp16_matmul-no-scales":
a.to(dtype=torch.float16), b.to(dtype=torch.float16) lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
), "pytorch_fp8_fp8_fp16_scaled_mm":
"pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm( lambda: torch._scaled_mm(
a, b, scale_a, scale_b, out_dtype=torch.float16 a, b, scale_a, scale_b, out_dtype=torch.float16),
), "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm( lambda: torch._scaled_mm(a,
a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True b,
), scale_a,
"pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm( scale_b,
a, b, scale_a, scale_b, out_dtype=torch.bfloat16 out_dtype=torch.float16,
), use_fast_accum=True),
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm( "pytorch_fp8_fp8_bf16_scaled_mm":
a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True lambda: torch._scaled_mm(
), a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
"cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
a, b, scale_a, scale_b, torch.bfloat16 lambda: torch._scaled_mm(a,
), b,
"cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm( scale_a,
a, b, scale_a, scale_b, torch.float16 scale_b,
), out_dtype=torch.bfloat16,
"cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( use_fast_accum=True),
a, b, scale_a, scale_b, torch.bfloat16, bias "cutlass_fp8_fp8_bf16_scaled_mm":
), lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( "cutlass_fp8_fp8_fp16_scaled_mm":
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
), "cutlass_fp8_fp8_bf16_scaled_mm_bias":
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) bias),
), "cutlass_fp8_fp8_fp16_scaled_mm_bias":
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16 bias.to(dtype=torch.float16)),
), "triton_fp8_fp8_fp16_scaled_mm_blockwise":
lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
block_scale_b.t(), (128, 128)),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
block_scale_b_K_major, torch.float16),
} }
timers = [] timers = []
@ -180,15 +175,13 @@ def bench_fp8(
return timers return timers
def bench( def bench(dtype: torch.dtype,
dtype: torch.dtype, m: int,
m: int, k: int,
k: int, n: int,
n: int, label: str,
label: str, sub_label: str,
sub_label: str, bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8: if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
if dtype == torch.float8_e4m3fn: if dtype == torch.float8_e4m3fn:
@ -202,33 +195,27 @@ def print_timers(timers: Iterable[TMeasurement]):
compare.print() compare.print()
def run( def run(dtype: torch.dtype,
dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]],
MKNs: Iterable[tuple[int, int, int]], bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
results = [] results = []
for m, k, n in MKNs: for m, k, n in MKNs:
timers = bench( timers = bench(dtype,
dtype, m,
m, k,
k, n,
n, f"scaled-{dtype}-gemm",
f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})",
f"MKN=({m}x{k}x{n})", bench_kernels=bench_kernels)
bench_kernels=bench_kernels,
)
print_timers(timers) print_timers(timers)
results.extend(timers) results.extend(timers)
return results return results
def make_output( def make_output(data: Iterable[TMeasurement],
data: Iterable[TMeasurement], MKNs: Iterable[tuple[int, int, int]],
MKNs: Iterable[tuple[int, int, int]], base_description: str,
base_description: str, timestamp=None):
timestamp=None,
):
print(f"== All Results {base_description} ====") print(f"== All Results {base_description} ====")
print_timers(data) print_timers(data)
@ -239,7 +226,8 @@ def make_output(
def run_square_bench(args): def run_square_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs, bench_kernels=args.kernels) data = run(args.dtype, MKNs, bench_kernels=args.kernels)
make_output(data, MKNs, f"square_bench-{args.dtype}") make_output(data, MKNs, f"square_bench-{args.dtype}")
@ -297,7 +285,7 @@ def run_model_bench(args):
pkl.dump(all_data, f) pkl.dump(all_data, f)
if __name__ == "__main__": if __name__ == '__main__':
def to_torch_dtype(dt): def to_torch_dtype(dt):
if dt == "int8": if dt == "int8":
@ -322,21 +310,19 @@ Benchmark Cutlass GEMM.
Output: Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501 """, # noqa: E501
formatter_class=argparse.RawTextHelpFormatter, formatter_class=argparse.RawTextHelpFormatter)
)
parser.add_argument( parser.add_argument("--dtype",
"--dtype", type=to_torch_dtype,
type=to_torch_dtype, required=True,
required=True, help="Available options are ['int8', 'fp8']")
help="Available options are ['int8', 'fp8']",
)
parser.add_argument( parser.add_argument(
"--kernels", "--kernels",
nargs="+", nargs="+",
type=str, type=str,
default=None, default=None,
help="Exact names of the kernels to benchmark. If not set, runs all kernels.", help=
"Exact names of the kernels to benchmark. If not set, runs all kernels."
) )
subparsers = parser.add_subparsers(dest="cmd") subparsers = parser.add_subparsers(dest="cmd")
@ -357,19 +343,19 @@ Benchmark Cutlass GEMM.
range_parser.set_defaults(func=run_range_bench) range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench") model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument( model_parser.add_argument("--models",
"--models", nargs="+",
nargs="+", type=str,
type=str, default=DEFAULT_MODELS,
default=DEFAULT_MODELS, choices=WEIGHT_SHAPES.keys())
choices=WEIGHT_SHAPES.keys(), model_parser.add_argument("--tp-sizes",
) nargs="+",
model_parser.add_argument( type=int,
"--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES default=DEFAULT_TP_SIZES)
) model_parser.add_argument("--batch-sizes",
model_parser.add_argument( nargs="+",
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES type=int,
) default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench) model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args() args = parser.parse_args()

View File

@ -42,4 +42,4 @@ WEIGHT_SHAPES = {
([8192, 57344], 1), ([8192, 57344], 1),
([28672, 8192], 0), ([28672, 8192], 0),
], ],
} }

View File

@ -12,37 +12,39 @@ app = Quart(__name__)
async def forward_request(url, data): async def forward_request(url, data):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} headers = {
async with session.post(url=url, json=data, headers=headers) as response: "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
async with session.post(url=url, json=data,
headers=headers) as response:
if response.status == 200: if response.status == 200:
# if response.headers.get('Transfer-Encoding') == 'chunked': # if response.headers.get('Transfer-Encoding') == 'chunked':
if True: if True:
async for chunk_bytes in response.content.iter_chunked(1024): async for chunk_bytes in response.content.iter_chunked(
1024):
yield chunk_bytes yield chunk_bytes
else: else:
content = await response.read() content = await response.read()
yield content yield content
@app.route("/v1/completions", methods=["POST"]) @app.route('/v1/completions', methods=['POST'])
async def handle_request(): async def handle_request():
try: try:
original_request_data = await request.get_json() original_request_data = await request.get_json()
prefill_request = original_request_data.copy() prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill # change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1 prefill_request['max_tokens'] = 1
# finish prefill # finish prefill
async for _ in forward_request( async for _ in forward_request('http://localhost:8100/v1/completions',
"http://localhost:8100/v1/completions", prefill_request prefill_request):
):
continue continue
# return decode # return decode
generator = forward_request( generator = forward_request('http://localhost:8200/v1/completions',
"http://localhost:8200/v1/completions", original_request_data original_request_data)
)
response = await make_response(generator) response = await make_response(generator)
response.timeout = None response.timeout = None
@ -51,12 +53,11 @@ async def handle_request():
except Exception as e: except Exception as e:
import sys import sys
import traceback import traceback
exc_info = sys.exc_info() exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server") print("Error occurred in disagg prefill proxy server")
print(e) print(e)
print("".join(traceback.format_exception(*exc_info))) print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__": if __name__ == '__main__':
app.run(port=8000) app.run(port=8000)

View File

@ -8,6 +8,7 @@ from aiohttp import web
class RoundRobinProxy: class RoundRobinProxy:
def __init__(self, target_ports): def __init__(self, target_ports):
self.target_ports = target_ports self.target_ports = target_ports
self.port_cycle = itertools.cycle(self.target_ports) self.port_cycle = itertools.cycle(self.target_ports)
@ -20,15 +21,14 @@ class RoundRobinProxy:
try: try:
# Forward the request # Forward the request
async with session.request( async with session.request(
method=request.method, method=request.method,
url=target_url, url=target_url,
headers=request.headers, headers=request.headers,
data=request.content, data=request.content,
) as response: ) as response:
# Start sending the response # Start sending the response
resp = web.StreamResponse( resp = web.StreamResponse(status=response.status,
status=response.status, headers=response.headers headers=response.headers)
)
await resp.prepare(request) await resp.prepare(request)
# Stream the response content # Stream the response content
@ -45,11 +45,11 @@ class RoundRobinProxy:
async def main(): async def main():
proxy = RoundRobinProxy([8100, 8200]) proxy = RoundRobinProxy([8100, 8200])
app = web.Application() app = web.Application()
app.router.add_route("*", "/{path:.*}", proxy.handle_request) app.router.add_route('*', '/{path:.*}', proxy.handle_request)
runner = web.AppRunner(app) runner = web.AppRunner(app)
await runner.setup() await runner.setup()
site = web.TCPSite(runner, "localhost", 8000) site = web.TCPSite(runner, 'localhost', 8000)
await site.start() await site.start()
print("Proxy server started on http://localhost:8000") print("Proxy server started on http://localhost:8000")
@ -58,5 +58,5 @@ async def main():
await asyncio.Event().wait() await asyncio.Event().wait()
if __name__ == "__main__": if __name__ == '__main__':
asyncio.run(main()) asyncio.run(main())

Some files were not shown because too many files have changed in this diff Show More