Compare commits
12 Commits
v0.9.0.1
...
mla_cuda_g
| Author | SHA1 | Date | |
|---|---|---|---|
| 0a02744dc8 | |||
| 984ffddda6 | |||
| 135c404fbb | |||
| 7241acbd64 | |||
| 2b140debbb | |||
| 2326814c11 | |||
| 534cd0006d | |||
| aa19f297d2 | |||
| 4880a43d20 | |||
| 3895bba85a | |||
| f23d126a07 | |||
| ec8c1cf732 |
@ -1,19 +1,17 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
|
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
|
||||||
# Note that we have 400 MiB quota, please use it wisely.
|
# Note that we have 400 MiB quota, please use it wisely.
|
||||||
# See https://github.com/pypi/support/issues/3792 .
|
# See https://github.com/pypi/support/issues/3792 .
|
||||||
# Please also sync the value with the one in Dockerfile.
|
# Please also sync the value with the one in Dockerfile.
|
||||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
|
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
|
||||||
|
|
||||||
|
|
||||||
def print_top_10_largest_files(zip_file):
|
def print_top_10_largest_files(zip_file):
|
||||||
"""Print the top 10 largest files in the given zip file."""
|
"""Print the top 10 largest files in the given zip file."""
|
||||||
with zipfile.ZipFile(zip_file, "r") as z:
|
with zipfile.ZipFile(zip_file, 'r') as z:
|
||||||
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
|
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
|
||||||
file_sizes.sort(key=lambda x: x[1], reverse=True)
|
file_sizes.sort(key=lambda x: x[1], reverse=True)
|
||||||
for f, size in file_sizes[:10]:
|
for f, size in file_sizes[:10]:
|
||||||
@ -28,18 +26,14 @@ def check_wheel_size(directory):
|
|||||||
wheel_path = os.path.join(root, file_name)
|
wheel_path = os.path.join(root, file_name)
|
||||||
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
|
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
|
||||||
if wheel_size_mb > VLLM_MAX_SIZE_MB:
|
if wheel_size_mb > VLLM_MAX_SIZE_MB:
|
||||||
print(
|
print(f"Not allowed: Wheel {wheel_path} is larger "
|
||||||
f"Not allowed: Wheel {wheel_path} is larger "
|
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
f"({VLLM_MAX_SIZE_MB} MB).")
|
||||||
f"({VLLM_MAX_SIZE_MB} MB)."
|
|
||||||
)
|
|
||||||
print_top_10_largest_files(wheel_path)
|
print_top_10_largest_files(wheel_path)
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
print(
|
print(f"Wheel {wheel_path} is within the allowed size "
|
||||||
f"Wheel {wheel_path} is within the allowed size "
|
f"({wheel_size_mb:.2f} MB).")
|
||||||
f"({wheel_size_mb:.2f} MB)."
|
|
||||||
)
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
@ -49,4 +43,4 @@ if __name__ == "__main__":
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
directory = sys.argv[1]
|
directory = sys.argv[1]
|
||||||
sys.exit(check_wheel_size(directory))
|
sys.exit(check_wheel_size(directory))
|
||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@ -22,5 +20,5 @@ with open("index.html", "w") as f:
|
|||||||
print(f"Generated index.html for {args.wheel}")
|
print(f"Generated index.html for {args.wheel}")
|
||||||
# cloudfront requires escaping the '+' character
|
# cloudfront requires escaping the '+' character
|
||||||
f.write(
|
f.write(
|
||||||
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
|
template.format(wheel=filename,
|
||||||
)
|
wheel_html_escaped=filename.replace("+", "%2B")))
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
||||||
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
||||||
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||||
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||||
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
|
||||||
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,11 +0,0 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
|
|
||||||
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.335
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.323
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,12 +1,11 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
||||||
model_name: "mgoin/Minitron-4B-Base-FP8"
|
model_name: "mgoin/Minitron-4B-Base-FP8"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
metrics:
|
metrics:
|
||||||
- name: "exact_match,strict-match"
|
- name: "exact_match,strict-match"
|
||||||
value: 0.231
|
value: 0.233
|
||||||
- name: "exact_match,flexible-extract"
|
- name: "exact_match,flexible-extract"
|
||||||
value: 0.22
|
value: 0.236
|
||||||
limit: 1000
|
limit: 1000
|
||||||
num_fewshot: 5
|
num_fewshot: 5
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||||
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||||
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
|
||||||
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
|
|||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
|
|
||||||
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.30
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.465
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||||
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||||
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||||
tasks:
|
tasks:
|
||||||
|
|||||||
@ -1,11 +0,0 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
|
|
||||||
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.54
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.59
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
|
||||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.47
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.64
|
|
||||||
limit: 1319
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
|
||||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.6353
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.637
|
|
||||||
limit: null
|
|
||||||
num_fewshot: null
|
|
||||||
@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
|
|||||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
Qwen2-57B-A14-Instruct.yaml
|
Qwen2-57B-A14-Instruct.yaml
|
||||||
DeepSeek-V2-Lite-Chat.yaml
|
DeepSeek-V2-Lite-Chat.yaml
|
||||||
Meta-Llama-3-8B-QQQ.yaml
|
|
||||||
|
|||||||
@ -1,6 +1,10 @@
|
|||||||
Qwen2.5-1.5B-Instruct.yaml
|
Meta-Llama-3-8B-Instruct.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
Minitron-4B-Base-FP8.yaml
|
||||||
|
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
|
||||||
|
Qwen2-1.5B-Instruct-FP8W8.yaml
|
||||||
|
Meta-Llama-3-8B-QQQ.yaml
|
||||||
|
|||||||
@ -1,43 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
|
|
||||||
parser.addoption(
|
|
||||||
"--config-list-file",
|
|
||||||
action="store",
|
|
||||||
help="Path to the file listing model config YAMLs (one per line)",
|
|
||||||
)
|
|
||||||
parser.addoption(
|
|
||||||
"--tp-size",
|
|
||||||
action="store",
|
|
||||||
default="1",
|
|
||||||
help="Tensor parallel size to use for evaluation",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def config_list_file(pytestconfig, config_dir):
|
|
||||||
rel_path = pytestconfig.getoption("--config-list-file")
|
|
||||||
return config_dir / rel_path
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def tp_size(pytestconfig):
|
|
||||||
return pytestconfig.getoption("--tp-size")
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_generate_tests(metafunc):
|
|
||||||
if "config_filename" in metafunc.fixturenames:
|
|
||||||
rel_path = metafunc.config.getoption("--config-list-file")
|
|
||||||
config_list_file = Path(rel_path).resolve()
|
|
||||||
config_dir = config_list_file.parent
|
|
||||||
with open(config_list_file, encoding="utf-8") as f:
|
|
||||||
configs = [
|
|
||||||
config_dir / line.strip()
|
|
||||||
for line in f
|
|
||||||
if line.strip() and not line.startswith("#")
|
|
||||||
]
|
|
||||||
metafunc.parametrize("config_filename", configs)
|
|
||||||
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using vllm and compares to "
|
||||||
|
echo "precomputed baseline (measured by HF transformers.)"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
|
||||||
|
echo " -t - tensor parallel size"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
SUCCESS=0
|
||||||
|
|
||||||
|
while getopts "c:t:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
c )
|
||||||
|
CONFIG="$OPTARG"
|
||||||
|
;;
|
||||||
|
t )
|
||||||
|
TP_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# Parse list of configs.
|
||||||
|
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||||
|
|
||||||
|
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||||
|
do
|
||||||
|
LOCAL_SUCCESS=0
|
||||||
|
|
||||||
|
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
|
||||||
|
|
||||||
|
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
|
||||||
|
export LM_EVAL_TP_SIZE=$TP_SIZE
|
||||||
|
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
|
||||||
|
|
||||||
|
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||||
|
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
else
|
||||||
|
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${SUCCESS}" -eq "0" ]; then
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
@ -1,54 +1,63 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
"""
|
"""
|
||||||
LM eval harness on model to compare vs HF baseline computed offline.
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
Configs are found in configs/$MODEL.yaml
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
pytest -s -v test_lm_eval_correctness.py \
|
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||||
--config-list-file=configs/models-small.txt \
|
* export LM_EVAL_TP_SIZE=4
|
||||||
--tp-size=1
|
* pytest -s test_lm_eval_correctness.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy as np
|
import numpy
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
RTOL = 0.08
|
RTOL = 0.05
|
||||||
|
TEST_DATA_FILE = os.environ.get(
|
||||||
|
"LM_EVAL_TEST_DATA_FILE",
|
||||||
|
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
||||||
|
|
||||||
|
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
||||||
|
|
||||||
|
|
||||||
def launch_lm_eval(eval_config, tp_size):
|
def launch_lm_eval(eval_config):
|
||||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
trust_remote_code = eval_config.get('trust_remote_code', False)
|
||||||
model_args = (
|
|
||||||
f"pretrained={eval_config['model_name']},"
|
model_args = f"pretrained={eval_config['model_name']}," \
|
||||||
f"tensor_parallel_size={tp_size},"
|
f"tensor_parallel_size={TP_SIZE}," \
|
||||||
f"enforce_eager=true,"
|
f"add_bos_token=true," \
|
||||||
f"add_bos_token=true,"
|
f"trust_remote_code={trust_remote_code}"
|
||||||
f"trust_remote_code={trust_remote_code}"
|
|
||||||
)
|
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
model="vllm",
|
model="vllm",
|
||||||
model_args=model_args,
|
model_args=model_args,
|
||||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
num_fewshot=eval_config["num_fewshot"],
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
limit=eval_config["limit"],
|
limit=eval_config["limit"],
|
||||||
batch_size="auto",
|
batch_size="auto")
|
||||||
)
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
def test_lm_eval_correctness():
|
||||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
eval_config = yaml.safe_load(
|
||||||
|
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
||||||
|
|
||||||
results = launch_lm_eval(eval_config, tp_size)
|
# Launch eval requests.
|
||||||
|
results = launch_lm_eval(eval_config)
|
||||||
|
|
||||||
|
# Confirm scores match ground truth.
|
||||||
success = True
|
success = True
|
||||||
for task in eval_config["tasks"]:
|
for task in eval_config["tasks"]:
|
||||||
for metric in task["metrics"]:
|
for metric in task["metrics"]:
|
||||||
ground_truth = metric["value"]
|
ground_truth = metric["value"]
|
||||||
measured_value = results["results"][task["name"]][metric["name"]]
|
measured_value = results["results"][task["name"]][metric["name"]]
|
||||||
print(
|
print(f'{task["name"]} | {metric["name"]}: '
|
||||||
f"{task['name']} | {metric['name']}: "
|
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
success = success and numpy.isclose(
|
||||||
)
|
ground_truth, measured_value, rtol=RTOL)
|
||||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
|
||||||
|
|
||||||
|
# Assert at the end, print all scores even on failure for debugging.
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@ -1,13 +1,15 @@
|
|||||||
# vLLM benchmark suite
|
# vLLM benchmark suite
|
||||||
|
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
This directory contains two sets of benchmark for vllm.
|
This directory contains two sets of benchmark for vllm.
|
||||||
|
|
||||||
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
|
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
|
||||||
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
||||||
|
|
||||||
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
|
||||||
|
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||||
|
|
||||||
|
|
||||||
## Performance benchmark quick overview
|
## Performance benchmark quick overview
|
||||||
|
|
||||||
@ -17,14 +19,17 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
|
|||||||
|
|
||||||
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
||||||
|
|
||||||
|
|
||||||
## Nightly benchmark quick overview
|
## Nightly benchmark quick overview
|
||||||
|
|
||||||
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
|
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
|
||||||
|
|
||||||
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
|
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
|
||||||
|
|
||||||
**Benchmarking Duration**: about 3.5hrs.
|
**Benchmarking Duration**: about 3.5hrs.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Trigger the benchmark
|
## Trigger the benchmark
|
||||||
|
|
||||||
Performance benchmark will be triggered when:
|
Performance benchmark will be triggered when:
|
||||||
@ -34,11 +39,16 @@ Performance benchmark will be triggered when:
|
|||||||
Nightly benchmark will be triggered when:
|
Nightly benchmark will be triggered when:
|
||||||
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Performance benchmark details
|
## Performance benchmark details
|
||||||
|
|
||||||
|
|
||||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||||
|
|
||||||
### Latency test
|
|
||||||
|
#### Latency test
|
||||||
|
|
||||||
Here is an example of one test inside `latency-tests.json`:
|
Here is an example of one test inside `latency-tests.json`:
|
||||||
|
|
||||||
@ -58,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`:
|
|||||||
```
|
```
|
||||||
|
|
||||||
In this example:
|
In this example:
|
||||||
|
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||||
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||||
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
|
||||||
|
|
||||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||||
|
|
||||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
|
||||||
|
|
||||||
### Throughput test
|
|
||||||
|
|
||||||
|
#### Throughput test
|
||||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
||||||
|
|
||||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||||
|
|
||||||
### Serving test
|
#### Serving test
|
||||||
|
|
||||||
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||||
|
|
||||||
```json
|
```
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
@ -101,7 +109,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
|
|||||||
```
|
```
|
||||||
|
|
||||||
Inside this example:
|
Inside this example:
|
||||||
|
|
||||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||||
- The `server-parameters` includes the command line arguments for vLLM server.
|
- The `server-parameters` includes the command line arguments for vLLM server.
|
||||||
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
||||||
@ -111,33 +118,36 @@ The number of this test is less stable compared to the delay and latency benchma
|
|||||||
|
|
||||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||||
|
|
||||||
### Visualizing the results
|
#### Visualizing the results
|
||||||
|
|
||||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||||
If you do not see the table, please wait till the benchmark finish running.
|
If you do not see the table, please wait till the benchmark finish running.
|
||||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Nightly test details
|
## Nightly test details
|
||||||
|
|
||||||
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
|
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
|
||||||
|
|
||||||
### Workflow
|
|
||||||
|
|
||||||
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
|
#### Workflow
|
||||||
|
|
||||||
|
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
|
||||||
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
|
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
|
||||||
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
|
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
|
||||||
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
|
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
|
||||||
|
|
||||||
### Nightly tests
|
#### Nightly tests
|
||||||
|
|
||||||
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
|
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
|
||||||
|
|
||||||
### Docker containers
|
#### Docker containers
|
||||||
|
|
||||||
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
|
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
|
||||||
|
|
||||||
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
|
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
|
||||||
|
|
||||||
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
|
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
|
||||||
|
|
||||||
|
|||||||
@ -10,18 +10,12 @@ steps:
|
|||||||
- image: badouralix/curl-jq
|
- image: badouralix/curl-jq
|
||||||
command:
|
command:
|
||||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||||
- label: "Cleanup H100"
|
|
||||||
agents:
|
|
||||||
queue: H100
|
|
||||||
depends_on: ~
|
|
||||||
command: docker system prune -a --volumes --force
|
|
||||||
|
|
||||||
- label: "A100"
|
- label: "A100"
|
||||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
agents:
|
agents:
|
||||||
queue: A100
|
queue: A100
|
||||||
depends_on: wait-for-container-image
|
depends_on: wait-for-container-image
|
||||||
if: build.branch == "main"
|
|
||||||
plugins:
|
plugins:
|
||||||
- kubernetes:
|
- kubernetes:
|
||||||
podSpec:
|
podSpec:
|
||||||
@ -56,7 +50,6 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: H200
|
queue: H200
|
||||||
depends_on: wait-for-container-image
|
depends_on: wait-for-container-image
|
||||||
if: build.branch == "main"
|
|
||||||
plugins:
|
plugins:
|
||||||
- docker#v5.12.0:
|
- docker#v5.12.0:
|
||||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||||
@ -82,7 +75,6 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: H100
|
queue: H100
|
||||||
depends_on: wait-for-container-image
|
depends_on: wait-for-container-image
|
||||||
if: build.branch == "main"
|
|
||||||
plugins:
|
plugins:
|
||||||
- docker#v5.12.0:
|
- docker#v5.12.0:
|
||||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||||
@ -98,87 +90,3 @@ steps:
|
|||||||
environment:
|
environment:
|
||||||
- VLLM_USAGE_SOURCE
|
- VLLM_USAGE_SOURCE
|
||||||
- HF_TOKEN
|
- HF_TOKEN
|
||||||
|
|
||||||
# Premerge benchmark
|
|
||||||
- label: "A100"
|
|
||||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
|
||||||
agents:
|
|
||||||
queue: A100
|
|
||||||
depends_on: wait-for-container-image
|
|
||||||
if: build.branch != "main"
|
|
||||||
plugins:
|
|
||||||
- kubernetes:
|
|
||||||
podSpec:
|
|
||||||
priorityClassName: perf-benchmark
|
|
||||||
containers:
|
|
||||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
|
||||||
command:
|
|
||||||
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
nvidia.com/gpu: 8
|
|
||||||
volumeMounts:
|
|
||||||
- name: devshm
|
|
||||||
mountPath: /dev/shm
|
|
||||||
env:
|
|
||||||
- name: VLLM_USAGE_SOURCE
|
|
||||||
value: ci-test
|
|
||||||
- name: HF_TOKEN
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: hf-token-secret
|
|
||||||
key: token
|
|
||||||
nodeSelector:
|
|
||||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
|
||||||
volumes:
|
|
||||||
- name: devshm
|
|
||||||
emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
|
|
||||||
- label: "H200"
|
|
||||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
|
||||||
agents:
|
|
||||||
queue: H200
|
|
||||||
depends_on: wait-for-container-image
|
|
||||||
if: build.branch != "main"
|
|
||||||
plugins:
|
|
||||||
- docker#v5.12.0:
|
|
||||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
|
||||||
command:
|
|
||||||
- bash
|
|
||||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
|
||||||
mount-buildkite-agent: true
|
|
||||||
propagate-environment: true
|
|
||||||
ipc: host
|
|
||||||
gpus: 4,5,6,7
|
|
||||||
volumes:
|
|
||||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
|
||||||
environment:
|
|
||||||
- VLLM_USAGE_SOURCE
|
|
||||||
- HF_TOKEN
|
|
||||||
|
|
||||||
#- block: "Run H100 Benchmark"
|
|
||||||
#key: block-h100
|
|
||||||
#depends_on: ~
|
|
||||||
|
|
||||||
- label: "H100"
|
|
||||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
|
||||||
agents:
|
|
||||||
queue: H100
|
|
||||||
depends_on: wait-for-container-image
|
|
||||||
if: build.branch != "main"
|
|
||||||
plugins:
|
|
||||||
- docker#v5.12.0:
|
|
||||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
|
||||||
command:
|
|
||||||
- bash
|
|
||||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
|
||||||
mount-buildkite-agent: true
|
|
||||||
propagate-environment: true
|
|
||||||
ipc: host
|
|
||||||
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
|
||||||
volumes:
|
|
||||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
|
||||||
environment:
|
|
||||||
- VLLM_USAGE_SOURCE
|
|
||||||
- HF_TOKEN
|
|
||||||
|
|||||||
@ -9,19 +9,20 @@ This file contains the downloading link for benchmarking results.
|
|||||||
|
|
||||||
Please download the visualization scripts in the post
|
Please download the visualization scripts in the post
|
||||||
|
|
||||||
|
|
||||||
## Results reproduction
|
## Results reproduction
|
||||||
|
|
||||||
- Find the docker we use in `benchmarking pipeline`
|
- Find the docker we use in `benchmarking pipeline`
|
||||||
- Deploy the docker, and inside the docker:
|
- Deploy the docker, and inside the docker:
|
||||||
- Download `nightly-benchmarks.zip`.
|
- Download `nightly-benchmarks.zip`.
|
||||||
- In the same folder, run the following code:
|
- In the same folder, run the following code
|
||||||
|
```
|
||||||
```console
|
export HF_TOKEN=<your HF token>
|
||||||
export HF_TOKEN=<your HF token>
|
apt update
|
||||||
apt update
|
apt install -y git
|
||||||
apt install -y git
|
unzip nightly-benchmarks.zip
|
||||||
unzip nightly-benchmarks.zip
|
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
```
|
||||||
```
|
|
||||||
|
|
||||||
And the results will be inside `./benchmarks/results`.
|
And the results will be inside `./benchmarks/results`.
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,6 @@
|
|||||||
# Nightly benchmark
|
# Nightly benchmark
|
||||||
|
|
||||||
This benchmark aims to:
|
This benchmark aims to:
|
||||||
|
|
||||||
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
||||||
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||||
|
|
||||||
@ -10,6 +9,7 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
|
|||||||
|
|
||||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||||
|
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
- Docker images:
|
- Docker images:
|
||||||
@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
|
|||||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||||
|
|
||||||
## Known issues
|
# Known issues
|
||||||
|
|
||||||
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||||
- TGI does not support `ignore-eos` flag.
|
- TGI does not support `ignore-eos` flag.
|
||||||
@ -7,8 +7,10 @@
|
|||||||
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||||
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||||
|
|
||||||
|
|
||||||
{latency_tests_markdown_table}
|
{latency_tests_markdown_table}
|
||||||
|
|
||||||
|
|
||||||
## Throughput tests
|
## Throughput tests
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
@ -17,8 +19,10 @@
|
|||||||
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||||
- Evaluation metrics: throughput.
|
- Evaluation metrics: throughput.
|
||||||
|
|
||||||
|
|
||||||
{throughput_tests_markdown_table}
|
{throughput_tests_markdown_table}
|
||||||
|
|
||||||
|
|
||||||
## Serving tests
|
## Serving tests
|
||||||
|
|
||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
@ -29,11 +33,13 @@
|
|||||||
- We also added a speculative decoding test for llama-3 70B, under QPS 2
|
- We also added a speculative decoding test for llama-3 70B, under QPS 2
|
||||||
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||||
|
|
||||||
|
|
||||||
{serving_tests_markdown_table}
|
{serving_tests_markdown_table}
|
||||||
|
|
||||||
|
|
||||||
## json version of the benchmarking tables
|
## json version of the benchmarking tables
|
||||||
|
|
||||||
This section contains the data of the markdown tables above in JSON format.
|
This section contains the data of the markdown tables above in JSON format.
|
||||||
You can load the benchmarking tables into pandas dataframes as follows:
|
You can load the benchmarking tables into pandas dataframes as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@ -48,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
|
|||||||
```
|
```
|
||||||
|
|
||||||
The json string for all benchmarking tables:
|
The json string for all benchmarking tables:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{benchmarking_results_in_json_string}
|
{benchmarking_results_in_json_string}
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -65,18 +63,18 @@ def read_markdown(file):
|
|||||||
|
|
||||||
|
|
||||||
def results_to_json(latency, throughput, serving):
|
def results_to_json(latency, throughput, serving):
|
||||||
return json.dumps(
|
return json.dumps({
|
||||||
{
|
'latency': latency.to_dict(),
|
||||||
"latency": latency.to_dict(),
|
'throughput': throughput.to_dict(),
|
||||||
"throughput": throughput.to_dict(),
|
'serving': serving.to_dict()
|
||||||
"serving": serving.to_dict(),
|
})
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
@ -84,13 +82,8 @@ if __name__ == "__main__":
|
|||||||
# this result is generated via `benchmark_serving.py`
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
@ -104,13 +97,8 @@ if __name__ == "__main__":
|
|||||||
# this result is generated via `benchmark_latency.py`
|
# this result is generated via `benchmark_latency.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
@ -120,8 +108,7 @@ if __name__ == "__main__":
|
|||||||
for perc in [10, 25, 50, 75, 90, 99]:
|
for perc in [10, 25, 50, 75, 90, 99]:
|
||||||
# Multiply 1000 to convert the time unit from s to ms
|
# Multiply 1000 to convert the time unit from s to ms
|
||||||
raw_result.update(
|
raw_result.update(
|
||||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
|
||||||
)
|
|
||||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||||
|
|
||||||
# add the result to raw_result
|
# add the result to raw_result
|
||||||
@ -132,13 +119,8 @@ if __name__ == "__main__":
|
|||||||
# this result is generated via `benchmark_throughput.py`
|
# this result is generated via `benchmark_throughput.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
@ -154,27 +136,26 @@ if __name__ == "__main__":
|
|||||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||||
|
|
||||||
raw_results_json = results_to_json(
|
raw_results_json = results_to_json(latency_results, throughput_results,
|
||||||
latency_results, throughput_results, serving_results
|
serving_results)
|
||||||
)
|
|
||||||
|
|
||||||
# remapping the key, for visualization purpose
|
# remapping the key, for visualization purpose
|
||||||
if not latency_results.empty:
|
if not latency_results.empty:
|
||||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
latency_results = latency_results[list(
|
||||||
columns=latency_column_mapping
|
latency_column_mapping.keys())].rename(
|
||||||
)
|
columns=latency_column_mapping)
|
||||||
if not serving_results.empty:
|
if not serving_results.empty:
|
||||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
serving_results = serving_results[list(
|
||||||
columns=serving_column_mapping
|
serving_column_mapping.keys())].rename(
|
||||||
)
|
columns=serving_column_mapping)
|
||||||
if not throughput_results.empty:
|
if not throughput_results.empty:
|
||||||
throughput_results = throughput_results[
|
throughput_results = throughput_results[list(
|
||||||
list(throughput_results_column_mapping.keys())
|
throughput_results_column_mapping.keys())].rename(
|
||||||
].rename(columns=throughput_results_column_mapping)
|
columns=throughput_results_column_mapping)
|
||||||
|
|
||||||
processed_results_json = results_to_json(
|
processed_results_json = results_to_json(latency_results,
|
||||||
latency_results, throughput_results, serving_results
|
throughput_results,
|
||||||
)
|
serving_results)
|
||||||
|
|
||||||
for df in [latency_results, serving_results, throughput_results]:
|
for df in [latency_results, serving_results, throughput_results]:
|
||||||
if df.empty:
|
if df.empty:
|
||||||
@ -186,39 +167,38 @@ if __name__ == "__main__":
|
|||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||||
# we want to turn it into "8xGPUTYPE"
|
# we want to turn it into "8xGPUTYPE"
|
||||||
df["GPU"] = df["GPU"].apply(
|
df["GPU"] = df["GPU"].apply(
|
||||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
|
||||||
)
|
|
||||||
|
|
||||||
# get markdown tables
|
# get markdown tables
|
||||||
latency_md_table = tabulate(
|
latency_md_table = tabulate(latency_results,
|
||||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
headers='keys',
|
||||||
)
|
tablefmt='pipe',
|
||||||
serving_md_table = tabulate(
|
showindex=False)
|
||||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
serving_md_table = tabulate(serving_results,
|
||||||
)
|
headers='keys',
|
||||||
throughput_md_table = tabulate(
|
tablefmt='pipe',
|
||||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
showindex=False)
|
||||||
)
|
throughput_md_table = tabulate(throughput_results,
|
||||||
|
headers='keys',
|
||||||
|
tablefmt='pipe',
|
||||||
|
showindex=False)
|
||||||
|
|
||||||
# document the result
|
# document the result
|
||||||
with open(results_folder / "benchmark_results.md", "w") as f:
|
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||||
results = read_markdown(
|
|
||||||
"../.buildkite/nightly-benchmarks/"
|
results = read_markdown("../.buildkite/nightly-benchmarks/" +
|
||||||
+ "performance-benchmarks-descriptions.md"
|
"performance-benchmarks-descriptions.md")
|
||||||
)
|
|
||||||
results = results.format(
|
results = results.format(
|
||||||
latency_tests_markdown_table=latency_md_table,
|
latency_tests_markdown_table=latency_md_table,
|
||||||
throughput_tests_markdown_table=throughput_md_table,
|
throughput_tests_markdown_table=throughput_md_table,
|
||||||
serving_tests_markdown_table=serving_md_table,
|
serving_tests_markdown_table=serving_md_table,
|
||||||
benchmarking_results_in_json_string=processed_results_json,
|
benchmarking_results_in_json_string=processed_results_json)
|
||||||
)
|
|
||||||
f.write(results)
|
f.write(results)
|
||||||
|
|
||||||
# document benchmarking results in json
|
# document benchmarking results in json
|
||||||
with open(results_folder / "benchmark_results.json", "w") as f:
|
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||||
results = (
|
|
||||||
latency_results.to_dict(orient="records")
|
results = latency_results.to_dict(
|
||||||
+ throughput_results.to_dict(orient="records")
|
orient='records') + throughput_results.to_dict(
|
||||||
+ serving_results.to_dict(orient="records")
|
orient='records') + serving_results.to_dict(orient='records')
|
||||||
)
|
|
||||||
f.write(json.dumps(results))
|
f.write(json.dumps(results))
|
||||||
|
|||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
@ -14,12 +12,15 @@ def main(model, cachedir):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Download and save Hugging Face tokenizer"
|
description="Download and save Hugging Face tokenizer")
|
||||||
)
|
parser.add_argument("--model",
|
||||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
type=str,
|
||||||
parser.add_argument(
|
required=True,
|
||||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
help="Name of the model")
|
||||||
)
|
parser.add_argument("--cachedir",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Directory to save the tokenizer")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args.model, args.cachedir)
|
main(args.model, args.cachedir)
|
||||||
|
|||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -11,33 +9,33 @@ from tabulate import tabulate
|
|||||||
|
|
||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Parse command line arguments for summary-nightly-results script."
|
description=
|
||||||
)
|
'Parse command line arguments for summary-nightly-results script.')
|
||||||
parser.add_argument(
|
parser.add_argument('--results-folder',
|
||||||
"--results-folder",
|
type=str,
|
||||||
type=str,
|
required=True,
|
||||||
required=True,
|
help='The folder where the results are stored.')
|
||||||
help="The folder where the results are stored.",
|
parser.add_argument('--description',
|
||||||
)
|
type=str,
|
||||||
parser.add_argument(
|
required=True,
|
||||||
"--description", type=str, required=True, help="Description of the results."
|
help='Description of the results.')
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def get_perf(df, method, model, metric):
|
def get_perf(df, method, model, metric):
|
||||||
|
|
||||||
means = []
|
means = []
|
||||||
|
|
||||||
for qps in [2, 4, 8, 16, "inf"]:
|
for qps in [2, 4, 8, 16, "inf"]:
|
||||||
target = df["Test name"].str.contains(model)
|
target = df['Test name'].str.contains(model)
|
||||||
target = target & df["Engine"].str.contains(method)
|
target = target & df['Engine'].str.contains(method)
|
||||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||||
filtered_df = df[target]
|
filtered_df = df[target]
|
||||||
|
|
||||||
if filtered_df.empty:
|
if filtered_df.empty:
|
||||||
means.append(0.0)
|
means.append(0.)
|
||||||
else:
|
else:
|
||||||
means.append(filtered_df[metric].values[0])
|
means.append(filtered_df[metric].values[0])
|
||||||
|
|
||||||
@ -45,6 +43,7 @@ def get_perf(df, method, model, metric):
|
|||||||
|
|
||||||
|
|
||||||
def get_perf_w_std(df, method, model, metric):
|
def get_perf_w_std(df, method, model, metric):
|
||||||
|
|
||||||
if metric in ["TTFT", "ITL"]:
|
if metric in ["TTFT", "ITL"]:
|
||||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||||
mean = mean.tolist()
|
mean = mean.tolist()
|
||||||
@ -59,8 +58,7 @@ def get_perf_w_std(df, method, model, metric):
|
|||||||
else:
|
else:
|
||||||
assert metric == "Tput"
|
assert metric == "Tput"
|
||||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||||
df, method, model, "Output Tput (tok/s)"
|
df, method, model, "Output Tput (tok/s)")
|
||||||
)
|
|
||||||
mean = mean.tolist()
|
mean = mean.tolist()
|
||||||
std = None
|
std = None
|
||||||
|
|
||||||
@ -80,17 +78,18 @@ def main(args):
|
|||||||
# generate markdown table
|
# generate markdown table
|
||||||
df = pd.DataFrame.from_dict(results)
|
df = pd.DataFrame.from_dict(results)
|
||||||
|
|
||||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||||
|
|
||||||
with open(args.description) as f:
|
with open(args.description) as f:
|
||||||
description = f.read()
|
description = f.read()
|
||||||
|
|
||||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
description = description.format(
|
||||||
|
nightly_results_benchmarking_table=md_table)
|
||||||
|
|
||||||
with open("nightly_results.md", "w") as f:
|
with open("nightly_results.md", "w") as f:
|
||||||
f.write(description)
|
f.write(description)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == '__main__':
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
from lmdeploy.serve.openai.api_client import APIClient
|
from lmdeploy.serve.openai.api_client import APIClient
|
||||||
|
|
||||||
api_client = APIClient("http://localhost:8000")
|
api_client = APIClient("http://localhost:8000")
|
||||||
|
|||||||
@ -426,7 +426,7 @@ main() {
|
|||||||
|
|
||||||
pip install -U transformers
|
pip install -U transformers
|
||||||
|
|
||||||
pip install -r requirements/dev.txt
|
pip install -r requirements-dev.txt
|
||||||
which genai-perf
|
which genai-perf
|
||||||
|
|
||||||
# check storage
|
# check storage
|
||||||
|
|||||||
@ -10,24 +10,15 @@ set -x
|
|||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
check_gpus() {
|
check_gpus() {
|
||||||
if command -v nvidia-smi; then
|
# check the number of GPUs and GPU type.
|
||||||
# check the number of GPUs and GPU type.
|
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
echo "GPU found."
|
echo "GPU found."
|
||||||
else
|
else
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if command -v nvidia-smi; then
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
|
||||||
fi
|
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,15 +90,9 @@ kill_gpu_processes() {
|
|||||||
|
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
if command -v nvidia-smi; then
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
sleep 1
|
||||||
sleep 1
|
done
|
||||||
done
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# remove vllm config file
|
# remove vllm config file
|
||||||
rm -rf ~/.config/vllm
|
rm -rf ~/.config/vllm
|
||||||
@ -324,14 +309,11 @@ run_serving_tests() {
|
|||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
|
||||||
# on the benchmark dashboard
|
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="python3 benchmark_serving.py \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $RESULTS_FOLDER \
|
--result-dir $RESULTS_FOLDER \
|
||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
--request-rate $qps \
|
--request-rate $qps \
|
||||||
--metadata "tensor_parallel_size=$tp" \
|
|
||||||
$client_args"
|
$client_args"
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
@ -363,11 +345,6 @@ main() {
|
|||||||
check_gpus
|
check_gpus
|
||||||
check_hf_token
|
check_hf_token
|
||||||
|
|
||||||
# Set to v1 to run v1 benchmark
|
|
||||||
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
|
|
||||||
export VLLM_USE_V1=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# dependencies
|
# dependencies
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
(which jq) || (apt-get update && apt-get -y install jq)
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
@ -376,7 +353,7 @@ main() {
|
|||||||
# get the current IP address, required by benchmark_serving.py
|
# get the current IP address, required by benchmark_serving.py
|
||||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||||
export VLLM_LOGGING_LEVEL="WARNING"
|
export VLLM_LOG_LEVEL="WARNING"
|
||||||
|
|
||||||
# prepare for benchmarking
|
# prepare for benchmarking
|
||||||
cd benchmarks || exit 1
|
cd benchmarks || exit 1
|
||||||
|
|||||||
@ -1,5 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@ -34,8 +32,10 @@ serving_column_mapping = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
@ -54,16 +54,17 @@ if __name__ == "__main__":
|
|||||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||||
|
|
||||||
if not serving_results.empty:
|
if not serving_results.empty:
|
||||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
serving_results = serving_results[list(
|
||||||
columns=serving_column_mapping
|
serving_column_mapping.keys())].rename(
|
||||||
)
|
columns=serving_column_mapping)
|
||||||
|
|
||||||
serving_md_table_with_headers = tabulate(
|
serving_md_table_with_headers = tabulate(serving_results,
|
||||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
headers='keys',
|
||||||
)
|
tablefmt='pipe',
|
||||||
|
showindex=False)
|
||||||
# remove the first line of header
|
# remove the first line of header
|
||||||
serving_md_table_lines = serving_md_table_with_headers.split("\n")
|
serving_md_table_lines = serving_md_table_with_headers.split('\n')
|
||||||
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
|
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
|
||||||
|
|
||||||
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||||
@ -73,9 +74,10 @@ if __name__ == "__main__":
|
|||||||
# document results with header.
|
# document results with header.
|
||||||
# for those who wants to reproduce our benchmark.
|
# for those who wants to reproduce our benchmark.
|
||||||
f.write(serving_md_table_with_headers)
|
f.write(serving_md_table_with_headers)
|
||||||
f.write("\n")
|
f.write('\n')
|
||||||
|
|
||||||
# document benchmarking results in json
|
# document benchmarking results in json
|
||||||
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||||
results = serving_results.to_dict(orient="records")
|
|
||||||
|
results = serving_results.to_dict(orient='records')
|
||||||
f.write(json.dumps(results))
|
f.write(json.dumps(results))
|
||||||
|
|||||||
@ -1,10 +1,6 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
|
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
|
||||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
|
||||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
|
|
||||||
else
|
|
||||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
TIMEOUT_SECONDS=10
|
TIMEOUT_SECONDS=10
|
||||||
|
|
||||||
|
|||||||
@ -29,4 +29,4 @@
|
|||||||
"num-iters": 15
|
"num-iters": 15
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -63,12 +63,11 @@
|
|||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
"swap_space": 16,
|
||||||
"speculative_config": {
|
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
|
||||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
"num_speculative_tokens": 4,
|
||||||
"num_speculative_tokens": 4,
|
"speculative_draft_tensor_parallel_size": 1,
|
||||||
"draft_tensor_parallel_size": 1
|
"use_v2_block_manager": ""
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
|
|||||||
@ -32,4 +32,4 @@
|
|||||||
"backend": "vllm"
|
"backend": "vllm"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -1,46 +0,0 @@
|
|||||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
|
||||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
|
||||||
# following differences:
|
|
||||||
# - ruff line length is overridden to 88
|
|
||||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
|
||||||
|
|
||||||
[tool.ruff]
|
|
||||||
line-length = 88
|
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
|
||||||
"vllm/third_party/**" = ["ALL"]
|
|
||||||
"vllm/version.py" = ["F401"]
|
|
||||||
"vllm/_version.py" = ["ALL"]
|
|
||||||
|
|
||||||
[tool.ruff.lint]
|
|
||||||
select = [
|
|
||||||
# pycodestyle
|
|
||||||
"E",
|
|
||||||
# Pyflakes
|
|
||||||
"F",
|
|
||||||
# pyupgrade
|
|
||||||
"UP",
|
|
||||||
# flake8-bugbear
|
|
||||||
"B",
|
|
||||||
# flake8-simplify
|
|
||||||
"SIM",
|
|
||||||
# isort
|
|
||||||
"I",
|
|
||||||
# flake8-logging-format
|
|
||||||
"G",
|
|
||||||
]
|
|
||||||
ignore = [
|
|
||||||
# star imports
|
|
||||||
"F405", "F403",
|
|
||||||
# lambda expression assignment
|
|
||||||
"E731",
|
|
||||||
# Loop control variable not used within loop body
|
|
||||||
"B007",
|
|
||||||
# f-string format
|
|
||||||
"UP032",
|
|
||||||
# Can remove once 3.10+ is the minimum Python version
|
|
||||||
"UP007",
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.ruff.format]
|
|
||||||
docstring-code-format = true
|
|
||||||
@ -1,23 +1,12 @@
|
|||||||
steps:
|
steps:
|
||||||
- label: "Build wheel - CUDA 12.8"
|
- label: "Build wheel - CUDA 12.1"
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.6"
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
@ -31,10 +20,10 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
@ -48,7 +37,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Build and publish TPU release image"
|
- label: "Build and publish TPU release image"
|
||||||
@ -57,23 +46,16 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: tpu_queue_postmerge
|
queue: tpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "yes | docker system prune -a"
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
|
||||||
- "git fetch --all"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
|
||||||
- "docker push vllm/vllm-tpu:nightly"
|
- "docker push vllm/vllm-tpu:nightly"
|
||||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||||
plugins:
|
plugins:
|
||||||
- docker-login#v3.0.0:
|
- docker-login#v3.0.0:
|
||||||
username: vllmbot
|
username: vllm
|
||||||
password-env: DOCKERHUB_TOKEN
|
password-env: DOCKERHUB_TOKEN
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
|
||||||
fields:
|
|
||||||
- text: "What is the release version?"
|
|
||||||
key: "release-version"
|
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
@ -84,22 +66,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- block: "Build Neuron release image"
|
|
||||||
key: block-neuron-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build and publish Neuron release image"
|
|
||||||
depends_on: block-neuron-release-image-build
|
|
||||||
agents:
|
|
||||||
queue: neuron-postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|||||||
156
.buildkite/run-amd-test.sh
Executable file
156
.buildkite/run-amd-test.sh
Executable file
@ -0,0 +1,156 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script runs test inside the corresponding ROCm docker container.
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
# Print ROCm version
|
||||||
|
echo "--- Confirming Clean Initial State"
|
||||||
|
while true; do
|
||||||
|
sleep 3
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "--- ROCm info"
|
||||||
|
rocminfo
|
||||||
|
|
||||||
|
# cleanup older docker images
|
||||||
|
cleanup_docker() {
|
||||||
|
# Get Docker's root directory
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Call the cleanup docker function
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
|
echo "--- Resetting GPUs"
|
||||||
|
|
||||||
|
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
sleep 3
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "--- Pulling container"
|
||||||
|
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
|
docker pull "${image_name}"
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||||
|
}
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
echo "--- Running container"
|
||||||
|
|
||||||
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
|
mkdir -p "${HF_CACHE}"
|
||||||
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
|
commands=$@
|
||||||
|
echo "Commands:$commands"
|
||||||
|
#ignore certain kernels tests
|
||||||
|
if [[ $commands == *" kernels "* ]]; then
|
||||||
|
commands="${commands} \
|
||||||
|
--ignore=kernels/test_attention.py \
|
||||||
|
--ignore=kernels/test_attention_selector.py \
|
||||||
|
--ignore=kernels/test_blocksparse_attention.py \
|
||||||
|
--ignore=kernels/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/test_cutlass.py \
|
||||||
|
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||||
|
--ignore=kernels/test_flash_attn.py \
|
||||||
|
--ignore=kernels/test_flashinfer.py \
|
||||||
|
--ignore=kernels/test_int8_quant.py \
|
||||||
|
--ignore=kernels/test_machete_gemm.py \
|
||||||
|
--ignore=kernels/test_mamba_ssm.py \
|
||||||
|
--ignore=kernels/test_marlin_gemm.py \
|
||||||
|
--ignore=kernels/test_moe.py \
|
||||||
|
--ignore=kernels/test_prefix_prefill.py \
|
||||||
|
--ignore=kernels/test_rand.py \
|
||||||
|
--ignore=kernels/test_sampler.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
#ignore certain Entrypoints tests
|
||||||
|
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||||
|
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||||
|
--ignore=entrypoints/openai/test_accuracy.py \
|
||||||
|
--ignore=entrypoints/openai/test_audio.py \
|
||||||
|
--ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||||
|
--ignore=entrypoints/openai/test_embedding.py \
|
||||||
|
--ignore=entrypoints/openai/test_oot_registration.py "}
|
||||||
|
fi
|
||||||
|
|
||||||
|
PARALLEL_JOB_COUNT=8
|
||||||
|
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||||
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
|
# assign job count as the number of shards used
|
||||||
|
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||||
|
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||||
|
# assign shard-id for each shard
|
||||||
|
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||||
|
echo "Shard ${GPU} commands:$commands_gpu"
|
||||||
|
docker run \
|
||||||
|
--device /dev/kfd --device /dev/dri \
|
||||||
|
--network host \
|
||||||
|
--shm-size=16gb \
|
||||||
|
--rm \
|
||||||
|
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||||
|
-e HF_TOKEN \
|
||||||
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
--name "${container_name}_${GPU}" \
|
||||||
|
"${image_name}" \
|
||||||
|
/bin/bash -c "${commands_gpu}" \
|
||||||
|
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||||
|
PIDS+=($!)
|
||||||
|
done
|
||||||
|
#wait for all processes to finish and collect exit codes
|
||||||
|
for pid in "${PIDS[@]}"; do
|
||||||
|
wait "${pid}"
|
||||||
|
STATUS+=($?)
|
||||||
|
done
|
||||||
|
for st in "${STATUS[@]}"; do
|
||||||
|
if [[ ${st} -ne 0 ]]; then
|
||||||
|
echo "One of the processes failed with $st"
|
||||||
|
exit "${st}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
else
|
||||||
|
docker run \
|
||||||
|
--device /dev/kfd --device /dev/dri \
|
||||||
|
--network host \
|
||||||
|
--shm-size=16gb \
|
||||||
|
--rm \
|
||||||
|
-e HIP_VISIBLE_DEVICES=0 \
|
||||||
|
-e HF_TOKEN \
|
||||||
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
--name "${container_name}" \
|
||||||
|
"${image_name}" \
|
||||||
|
/bin/bash -c "${commands}"
|
||||||
|
fi
|
||||||
@ -5,8 +5,8 @@
|
|||||||
set -ex
|
set -ex
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
# cd 2 levels into the working directory
|
# cd into parent directory of this file
|
||||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||||
|
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||||
|
|
||||||
@ -8,40 +8,34 @@ set -ex
|
|||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
|
||||||
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
|
||||||
set -e;
|
|
||||||
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
|
|
||||||
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
export NUMA_NODE=$2
|
export NUMA_NODE=$2
|
||||||
export BUILDKITE_BUILD_NUMBER=$3
|
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic.py"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
pip install -r vllm/requirements-test.txt
|
||||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
|
||||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||||
@ -91,4 +85,4 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
|
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||||
@ -9,13 +9,11 @@ python3 use_existing_torch.py
|
|||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
DOCKER_BUILDKIT=1 docker build . \
|
DOCKER_BUILDKIT=1 docker build . \
|
||||||
--file docker/Dockerfile \
|
|
||||||
--target vllm-openai \
|
--target vllm-openai \
|
||||||
--platform "linux/arm64" \
|
--platform "linux/arm64" \
|
||||||
-t gh200-test \
|
-t gh200-test \
|
||||||
--build-arg max_jobs=66 \
|
--build-arg max_jobs=66 \
|
||||||
--build-arg nvcc_threads=2 \
|
--build-arg nvcc_threads=2 \
|
||||||
--build-arg RUN_WHEEL_CHECK=false \
|
|
||||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||||
|
|
||||||
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image and test offline inference
|
# Run the image and test offline inference
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
python3 examples/offline_inference/basic.py
|
||||||
'
|
'
|
||||||
24
.buildkite/run-hpu-test.sh
Normal file
24
.buildkite/run-hpu-test.sh
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
# certain versions of HPU software stack have a bug that can
|
||||||
|
# override the exit code of the script, so we need to use
|
||||||
|
# separate remove_docker_container and remove_docker_container_and_exit
|
||||||
|
# functions, while other platforms only need one remove_docker_container
|
||||||
|
# function.
|
||||||
|
EXITCODE=1
|
||||||
|
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||||
|
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
|
||||||
|
trap remove_docker_container_and_exit EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
|
||||||
|
EXITCODE=$?
|
||||||
@ -3,7 +3,7 @@
|
|||||||
set -euox pipefail
|
set -euox pipefail
|
||||||
|
|
||||||
if [[ $# -lt 4 ]]; then
|
if [[ $# -lt 4 ]]; then
|
||||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|||||||
HF_CACHE="$(realpath ~)/huggingface"
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
mkdir -p "${HF_CACHE}"
|
mkdir -p "${HF_CACHE}"
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
|
||||||
|
|
||||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||||
|
|
||||||
# prune old image and containers to save disk space, and only once a day
|
# prune old image and containers to save disk space, and only once a day
|
||||||
# by using a timestamp file in tmp.
|
# by using a timestamp file in tmp.
|
||||||
@ -30,13 +29,16 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
|||||||
docker image prune -f
|
docker image prune -f
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
docker volume prune -f && docker system prune -f
|
docker volume prune -f && docker system prune -f
|
||||||
|
# Remove huggingface model artifacts and compiler cache
|
||||||
|
rm -rf "${HF_MOUNT:?}/*"
|
||||||
|
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
|
||||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||||
fi
|
fi
|
||||||
|
|
||||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
@ -45,19 +47,11 @@ remove_docker_container() {
|
|||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
# Run the image
|
# Run the image
|
||||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
|
||||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
${image_name} \
|
${image_name} \
|
||||||
/bin/bash -c "
|
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
|
||||||
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
|
||||||
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
|
||||||
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
|
||||||
echo 'Running test file: '$f;
|
|
||||||
python3 -m pytest \$f -v --capture=tee-sys;
|
|
||||||
done
|
|
||||||
"
|
|
||||||
16
.buildkite/run-openvino-test.sh
Executable file
16
.buildkite/run-openvino-test.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t openvino-test -f Dockerfile.openvino .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
|
||||||
26
.buildkite/run-tpu-test.sh
Executable file
26
.buildkite/run-tpu-test.sh
Executable file
@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Build the docker image.
|
||||||
|
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
|
# Set up cleanup.
|
||||||
|
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# For HF_TOKEN.
|
||||||
|
source /etc/environment
|
||||||
|
# Run a simple end-to-end example.
|
||||||
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
|
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install pytest \
|
||||||
|
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||||
|
&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
|
||||||
|
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
|
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||||
|
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||||
|
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||||
19
.buildkite/run-xpu-test.sh
Normal file
19
.buildkite/run-xpu-test.sh
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t xpu-test -f Dockerfile.xpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f xpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and test offline inference/tensor parallel
|
||||||
|
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
|
||||||
|
python3 examples/offline_inference/basic.py
|
||||||
|
python3 examples/offline_inference/cli.py -tp 2
|
||||||
|
'
|
||||||
@ -1,241 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script runs test inside the corresponding ROCm docker container.
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
# Export Python path
|
|
||||||
export PYTHONPATH=".."
|
|
||||||
|
|
||||||
# Print ROCm version
|
|
||||||
echo "--- Confirming Clean Initial State"
|
|
||||||
while true; do
|
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- ROCm info"
|
|
||||||
rocminfo
|
|
||||||
|
|
||||||
# cleanup older docker images
|
|
||||||
cleanup_docker() {
|
|
||||||
# Get Docker's root directory
|
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
|
||||||
if [ -z "$docker_root" ]; then
|
|
||||||
echo "Failed to determine Docker root directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Docker root directory: $docker_root"
|
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
|
||||||
echo "Docker images and volumes cleanup completed."
|
|
||||||
else
|
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Call the cleanup docker function
|
|
||||||
cleanup_docker
|
|
||||||
|
|
||||||
echo "--- Resetting GPUs"
|
|
||||||
|
|
||||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- Pulling container"
|
|
||||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
docker pull "${image_name}"
|
|
||||||
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
echo "--- Running container"
|
|
||||||
|
|
||||||
HF_CACHE="$(realpath ~)/huggingface"
|
|
||||||
mkdir -p "${HF_CACHE}"
|
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
|
||||||
|
|
||||||
commands=$@
|
|
||||||
echo "Commands:$commands"
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
|
||||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain kernels tests
|
|
||||||
if [[ $commands == *" kernels/core"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
|
||||||
--ignore=kernels/core/test_permute_cols.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/attention/stest_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
|
||||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
|
||||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_mha_attn.py \
|
|
||||||
--ignore=kernels/attention/test_lightning_attn.py \
|
|
||||||
--ignore=kernels/attention/test_attention.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/quantization/test_int8_quant.py \
|
|
||||||
--ignore=kernels/quantization/test_aqlm.py \
|
|
||||||
--ignore=kernels/quantization/test_machete_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_block_fp8.py \
|
|
||||||
--ignore=kernels/quantization/test_block_int8.py \
|
|
||||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
|
||||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
|
||||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
|
||||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/moe"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/moe/test_moe.py \
|
|
||||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
|
||||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/openai tests
|
|
||||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
|
||||||
--ignore=entrypoints/openai/test_audio.py \
|
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
|
||||||
--ignore=entrypoints/openai/test_sleep.py \
|
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
|
||||||
--ignore=entrypoints/openai/test_root_path.py \
|
|
||||||
--ignore=entrypoints/openai/test_tokenization.py \
|
|
||||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/llm tests
|
|
||||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
|
||||||
--ignore=entrypoints/llm/test_chat.py \
|
|
||||||
--ignore=entrypoints/llm/test_accuracy.py \
|
|
||||||
--ignore=entrypoints/llm/test_init.py \
|
|
||||||
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
|
||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#Obsolete currently
|
|
||||||
##ignore certain Entrypoints/llm tests
|
|
||||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
|
||||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
|
||||||
#fi
|
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
|
||||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
|
||||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
|
||||||
|
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
|
||||||
MYPYTHONPATH=".."
|
|
||||||
|
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
|
||||||
# assign job count as the number of shards used
|
|
||||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
|
||||||
# assign shard-id for each shard
|
|
||||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
|
||||||
echo "Shard ${GPU} commands:$commands_gpu"
|
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
|
||||||
docker run \
|
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
|
||||||
--rm \
|
|
||||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
|
||||||
--name "${container_name}_${GPU}" \
|
|
||||||
"${image_name}" \
|
|
||||||
/bin/bash -c "${commands_gpu}" \
|
|
||||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
|
||||||
PIDS+=($!)
|
|
||||||
done
|
|
||||||
#wait for all processes to finish and collect exit codes
|
|
||||||
for pid in "${PIDS[@]}"; do
|
|
||||||
wait "${pid}"
|
|
||||||
STATUS+=($?)
|
|
||||||
done
|
|
||||||
for st in "${STATUS[@]}"; do
|
|
||||||
if [[ ${st} -ne 0 ]]; then
|
|
||||||
echo "One of the processes failed with $st"
|
|
||||||
exit "${st}"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
else
|
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
|
||||||
docker run \
|
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
|
||||||
--rm \
|
|
||||||
-e HIP_VISIBLE_DEVICES=0 \
|
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
/bin/bash -c "${commands}"
|
|
||||||
fi
|
|
||||||
@ -1,48 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
if [[ -n "$container_id" ]]; then
|
|
||||||
podman rm -f "$container_id" || true
|
|
||||||
fi
|
|
||||||
podman system prune -f
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
|
||||||
|
|
||||||
# Run the image
|
|
||||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
|
||||||
pip install sentence-transformers datamodel_code_generator
|
|
||||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
|
||||||
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
|
|
||||||
export container_id
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 40m bash -c cpu_tests
|
|
||||||
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
# certain versions of HPU software stack have a bug that can
|
|
||||||
# override the exit code of the script, so we need to use
|
|
||||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
|
||||||
# functions, while other platforms only need one remove_docker_container
|
|
||||||
# function.
|
|
||||||
EXITCODE=1
|
|
||||||
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
|
|
||||||
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
|
|
||||||
trap remove_docker_containers_and_exit EXIT
|
|
||||||
remove_docker_containers
|
|
||||||
|
|
||||||
# Run the image and launch offline inference
|
|
||||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
|
||||||
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
|
|
||||||
|
|
||||||
EXITCODE=$?
|
|
||||||
@ -1,103 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
|
||||||
&& export VLLM_XLA_CACHE_PATH= \
|
|
||||||
&& export VLLM_USE_V1=1 \
|
|
||||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
|
||||||
&& echo HARDWARE \
|
|
||||||
&& tpu-info \
|
|
||||||
&& { \
|
|
||||||
echo TEST_0: Running test_perf.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
|
||||||
echo TEST_0_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_1: Running test_compilation.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
|
||||||
echo TEST_1_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_2: Running test_basic.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
|
||||||
echo TEST_2_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
echo TEST_3_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
|
||||||
echo TEST_4_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
|
||||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
|
||||||
echo TEST_5_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
|
||||||
echo TEST_6_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_7: Running test_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
|
||||||
echo TEST_7_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
|
||||||
echo TEST_8_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_9: Running test_multimodal.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
|
||||||
echo TEST_9_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_10: Running test_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
|
||||||
echo TEST_10_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_11: Running test_struct_output_generate.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
|
||||||
echo TEST_11_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_12: Running test_moe_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
|
||||||
echo TEST_12_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
# Disable the TPU LoRA tests until the feature is activated
|
|
||||||
# & { \
|
|
||||||
# echo TEST_13: Running test_moe_pallas.py; \
|
|
||||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
|
||||||
# echo TEST_13_EXIT_CODE: \$?; \
|
|
||||||
# } & \
|
|
||||||
wait \
|
|
||||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
|
||||||
"
|
|
||||||
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Run the image and test offline inference/tensor parallel
|
|
||||||
docker run \
|
|
||||||
--device /dev/dri \
|
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
sh -c '
|
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
|
||||||
'
|
|
||||||
@ -2,13 +2,12 @@
|
|||||||
# adding a new command to an existing step. See different options here for examples.
|
# adding a new command to an existing step. See different options here for examples.
|
||||||
|
|
||||||
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
||||||
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||||
# to generate the final pipeline yaml file.
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emoji allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
|
||||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
@ -16,7 +15,7 @@
|
|||||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||||
# in this case, commands must be specified. the first command runs on first host, the second
|
# in this case, commands must be specified. the first command runs on first host, the second
|
||||||
# command runs on the second host.
|
# command runs on the second host.
|
||||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||||
@ -25,24 +24,24 @@
|
|||||||
# When adding a test
|
# When adding a test
|
||||||
# - If the test belong to an existing group, add it there
|
# - If the test belong to an existing group, add it there
|
||||||
# - If the test is short, add to any existing step
|
# - If the test is short, add to any existing step
|
||||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||||
# Note that all steps execute in parallel.
|
# Note that all steps execute in parallel.
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
|
|
||||||
- label: Documentation Build # 2min
|
- label: Documentation Build # 2min
|
||||||
mirror_hardwares: [amdexperimental]
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
working_dir: "/vllm-workspace/test_docs"
|
|
||||||
fast_check: true
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r ../requirements/docs.txt
|
- pip install -r requirements-docs.txt
|
||||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
- SPHINXOPTS=\"-W\" make html
|
||||||
- mkdocs build
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
|
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
mirror_hardwares: [amdexperimental]
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
@ -51,20 +50,18 @@ steps:
|
|||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/test_utils
|
- tests/test_utils
|
||||||
- tests/worker
|
- tests/worker
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_torch_compile.py
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_torch_compile.py
|
||||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s test_utils.py # Utils
|
- pytest -v -s test_utils.py # Utils
|
||||||
- pytest -v -s worker # Worker
|
- pytest -v -s worker # Worker
|
||||||
|
|
||||||
- label: Python-only Installation Test
|
- label: Python-only Installation Test
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/standalone_tests/python_only_compile.sh
|
- tests/standalone_tests/python_only_compile.sh
|
||||||
- setup.py
|
- setup.py
|
||||||
@ -72,9 +69,8 @@ steps:
|
|||||||
- bash standalone_tests/python_only_compile.sh
|
- bash standalone_tests/python_only_compile.sh
|
||||||
|
|
||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
#mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
@ -82,14 +78,12 @@ steps:
|
|||||||
- tests/basic_correctness/test_preemption
|
- tests/basic_correctness/test_preemption
|
||||||
- tests/basic_correctness/test_cumem.py
|
- tests/basic_correctness/test_cumem.py
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Chunked Prefill Test
|
- label: Chunked Prefill Test
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_chunked_prefill
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
@ -98,7 +92,7 @@ steps:
|
|||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
|
||||||
- label: Core Test # 10min
|
- label: Core Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/core
|
- vllm/core
|
||||||
@ -108,80 +102,62 @@ steps:
|
|||||||
- pytest -v -s core
|
- pytest -v -s core
|
||||||
|
|
||||||
- label: Entrypoints Test # 40min
|
- label: Entrypoints Test # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/entrypoints/llm
|
|
||||||
- tests/entrypoints/openai
|
|
||||||
- tests/entrypoints/test_chat_utils
|
|
||||||
- tests/entrypoints/offline_mode
|
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/core/
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed
|
||||||
- tests/distributed/test_pynccl
|
|
||||||
- tests/distributed/test_events
|
|
||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with tp=2 and pp=2
|
|
||||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with internal dp
|
|
||||||
- python3 ../examples/offline_inference/data_parallel.py
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- python3 ../examples/offline_inference/rlhf.py
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
|
||||||
- popd
|
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
num_gpus: 2
|
||||||
num_gpus: 2
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/metrics
|
- tests/metrics
|
||||||
- tests/tracing
|
- tests/tracing
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s metrics
|
- pytest -v -s metrics
|
||||||
|
- "pip install \
|
||||||
|
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||||
- pytest -v -s tracing
|
- pytest -v -s tracing
|
||||||
|
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
- label: Regression Test # 5min
|
- label: Regression Test # 5min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_regression
|
- tests/test_regression
|
||||||
@ -191,72 +167,57 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests" # optional
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
- label: Engine Test # 10min
|
- label: Engine Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/engine
|
- tests/engine
|
||||||
- tests/tokenization
|
- tests/tokenization
|
||||||
- tests/test_sequence
|
|
||||||
- tests/test_config
|
|
||||||
- tests/test_logger
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
- label: V1 Test
|
- label: V1 Test
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s v1/core
|
- VLLM_USE_V1=1 pytest -v -s v1/core
|
||||||
- pytest -v -s v1/engine
|
- VLLM_USE_V1=1 pytest -v -s v1/engine
|
||||||
- pytest -v -s v1/entrypoints
|
- VLLM_USE_V1=1 pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/sample
|
- VLLM_USE_V1=1 pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/worker
|
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
|
||||||
- pytest -v -s v1/structured_output
|
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/spec_decode
|
|
||||||
- pytest -v -s v1/kv_connector/unit
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s v1/test_utils.py
|
|
||||||
- pytest -v -s v1/test_oracle.py
|
|
||||||
- pytest -v -s v1/test_metrics_reader.py
|
|
||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- VLLM_USE_V1=1 pytest -v -s v1/e2e
|
||||||
# Integration test for streaming correctness (requires special branch).
|
|
||||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/entrypoints
|
- vllm/entrypoints
|
||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 offline_inference/basic.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 offline_inference/cpu_offload.py
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 offline_inference/chat.py
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/prefix_caching.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/vision_language.py
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
|
||||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/classification.py
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/embedding.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 offline_inference/scoring.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
|
||||||
|
|
||||||
- label: Prefix Caching Test # 9min
|
- label: Prefix Caching Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/prefix_caching
|
- tests/prefix_caching
|
||||||
@ -264,7 +225,6 @@ steps:
|
|||||||
- pytest -v -s prefix_caching
|
- pytest -v -s prefix_caching
|
||||||
|
|
||||||
- label: Samplers Test # 36min
|
- label: Samplers Test # 36min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/sampling_metadata.py
|
- vllm/sampling_metadata.py
|
||||||
@ -275,51 +235,36 @@ steps:
|
|||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: LogitsProcessor Test # 5min
|
- label: LogitsProcessor Test # 5min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/model_executor/guided_decoding
|
- vllm/model_executor/guided_decoding
|
||||||
- tests/test_logits_processor
|
- tests/test_logits_processor
|
||||||
- tests/model_executor/test_guided_processors
|
- tests/model_executor/test_guided_processors
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s test_logits_processor.py
|
- pytest -v -s test_logits_processor.py
|
||||||
- pytest -v -s model_executor/test_guided_processors.py
|
- pytest -v -s model_executor/test_guided_processors.py
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
- label: Speculative decoding tests # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/spec_decode
|
- vllm/spec_decode
|
||||||
- tests/spec_decode
|
- tests/spec_decode
|
||||||
- vllm/model_executor/models/eagle.py
|
- vllm/model_executor/models/eagle.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: PyTorch Compilation Unit Tests
|
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/compile
|
|
||||||
commands:
|
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
|
||||||
- pytest -v -s compile/test_fusion.py
|
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
|
||||||
- pytest -v -s compile/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s compile/test_async_tp.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@ -329,120 +274,59 @@ steps:
|
|||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: "PyTorch Fullgraph Test" # 18min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
|
|
||||||
- label: Kernels Core Operation Test
|
- label: Kernels Test %N # 1h each
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/core
|
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/attention/
|
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
- vllm/v1/attention
|
- tests/kernels
|
||||||
- tests/kernels/attention
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 4
|
||||||
|
|
||||||
- label: Kernels Quantization Test %N
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
- tests/kernels/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
|
||||||
parallelism: 2
|
|
||||||
|
|
||||||
- label: Kernels MoE Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/moe/
|
|
||||||
- tests/kernels/moe
|
|
||||||
- vllm/model_executor/layers/fused_moe/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/moe
|
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/mamba/
|
|
||||||
- tests/kernels/mamba
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/mamba
|
|
||||||
|
|
||||||
- label: Tensorizer Test # 11min
|
- label: Tensorizer Test # 11min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
soft_fail: true
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader
|
- vllm/model_executor/model_loader
|
||||||
- tests/tensorizer_loader
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- bash scripts/run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
- label: Benchmarks CLI Test # 10min
|
- label: Quantization Test # 33min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/benchmarks/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s benchmarks/
|
|
||||||
|
|
||||||
- label: Quantization Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
commands:
|
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
- label: OpenAI API correctness
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/entrypoints/openai/
|
|
||||||
- vllm/model_executor/models/whisper.py
|
|
||||||
commands: # LMEval+Transcription WER check
|
|
||||||
- pytest -s entrypoints/openai/correctness/
|
|
||||||
|
|
||||||
- label: Encoder Decoder tests # 5min
|
- label: Encoder Decoder tests # 5min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/encoder_decoder
|
- tests/encoder_decoder
|
||||||
@ -450,117 +334,96 @@ steps:
|
|||||||
- pytest -v -s encoder_decoder
|
- pytest -v -s encoder_decoder
|
||||||
|
|
||||||
- label: OpenAI-Compatible Tool Use # 20 min
|
- label: OpenAI-Compatible Tool Use # 20 min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
fast_check: false
|
fast_check: false
|
||||||
|
mirror_hardwares: [ amd ]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
- tests/tool_use
|
||||||
- tests/mistral_tool_use
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
- pytest -v -s mistral_tool_use
|
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
- label: Basic Models Test # 24min
|
- label: Basic Models Test # 24min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models
|
- tests/models
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py
|
|
||||||
- pytest -v -s models/test_registry.py
|
- pytest -v -s models/test_registry.py
|
||||||
- pytest -v -s models/test_utils.py
|
|
||||||
- pytest -v -s models/test_vision.py
|
|
||||||
- pytest -v -s models/test_initialization.py
|
- pytest -v -s models/test_initialization.py
|
||||||
|
|
||||||
- label: Language Models Test (Standard)
|
- label: Language Models Test (Standard) # 32min
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language
|
- tests/models/decoder_only/language
|
||||||
|
- tests/models/embedding/language
|
||||||
|
- tests/models/encoder_decoder/language
|
||||||
commands:
|
commands:
|
||||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
- pytest -v -s models/embedding/language -m core_model
|
||||||
- pip freeze | grep -E 'torch'
|
|
||||||
- pytest -v -s models/language -m core_model
|
|
||||||
|
|
||||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
- label: Language Models Test (Extended) # 1h10min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/generation
|
- tests/models/decoder_only/language
|
||||||
|
- tests/models/embedding/language
|
||||||
|
- tests/models/encoder_decoder/language
|
||||||
commands:
|
commands:
|
||||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||||
- pytest -v -s models/language/generation -m 'not core_model'
|
|
||||||
|
|
||||||
- label: Language Models Test (Extended Pooling) # 36min
|
- label: Multi-Modal Models Test (Standard) # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/pooling
|
- tests/models/decoder_only/audio_language
|
||||||
commands:
|
- tests/models/decoder_only/vision_language
|
||||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
- tests/models/embedding/vision_language
|
||||||
|
- tests/models/encoder_decoder/audio_language
|
||||||
- label: Multi-Modal Models Test (Standard)
|
- tests/models/encoder_decoder/vision_language
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/multimodal
|
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pip freeze | grep -E 'torch'
|
- pytest -v -s models/multimodal
|
||||||
- pytest -v -s models/multimodal/processing
|
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||||
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||||
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- pytest -v -s models/embedding/vision_language -m core_model
|
||||||
|
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||||
|
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||||
|
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1
|
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/decoder_only/audio_language
|
||||||
|
- tests/models/decoder_only/vision_language
|
||||||
|
- tests/models/embedding/vision_language
|
||||||
|
- tests/models/encoder_decoder/vision_language
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||||
|
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
||||||
|
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||||
|
# https://github.com/huggingface/transformers/issues/34307
|
||||||
|
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||||
|
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||||
|
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||||
|
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||||
|
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 2
|
- label: Multi-Modal Models Test (Extended) 2 # 38m
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/decoder_only/vision_language
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 3
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/multimodal
|
|
||||||
commands:
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
|
||||||
|
|
||||||
- label: Quantized Models Test
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
- tests/models/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -v -s models/quantization
|
|
||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
# This test is used only in PR development phase to test individual models and should never run on main
|
||||||
- label: Custom Models Test
|
- label: Custom Models Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
optional: true
|
optional: true
|
||||||
commands:
|
commands:
|
||||||
- echo 'Testing custom models...'
|
- echo 'Testing custom models...'
|
||||||
@ -572,7 +435,6 @@ steps:
|
|||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test # 7min
|
- label: Distributed Comm Ops Test # 7min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -583,7 +445,6 @@ steps:
|
|||||||
- pytest -v -s distributed/test_shm_broadcast.py
|
- pytest -v -s distributed/test_shm_broadcast.py
|
||||||
|
|
||||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
num_nodes: 2
|
num_nodes: 2
|
||||||
@ -602,7 +463,7 @@ steps:
|
|||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 40min
|
- label: Distributed Tests (2 GPUs) # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -616,51 +477,43 @@ steps:
|
|||||||
- vllm/worker/worker.py
|
- vllm/worker/worker.py
|
||||||
- vllm/worker/model_runner.py
|
- vllm/worker/model_runner.py
|
||||||
- entrypoints/llm/test_collective_rpc.py
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
- vllm/v1/engine/
|
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
|
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
# test sequence parallel
|
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
# this test fails consistently.
|
# this test fails consistently.
|
||||||
# TODO: investigate and fix
|
# TODO: investigate and fix
|
||||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/plugins/
|
- vllm/plugins/
|
||||||
- tests/plugins/
|
- tests/plugins/
|
||||||
commands:
|
commands:
|
||||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
# begin platform plugin tests, all the code in-between runs on dummy platform
|
||||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||||
- pip uninstall vllm_add_dummy_platform -y
|
- pip uninstall vllm_add_dummy_platform -y
|
||||||
# end platform plugin tests
|
# end platform plugin tests
|
||||||
# other tests continue here:
|
# other tests continue here:
|
||||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
- pytest -v -s distributed/test_distributed_oot.py
|
- pytest -v -s distributed/test_distributed_oot.py
|
||||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
|
||||||
|
|
||||||
- label: Multi-step Tests (4 GPUs) # 36min
|
- label: Multi-step Tests (4 GPUs) # 36min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -681,7 +534,6 @@ steps:
|
|||||||
- pytest -v -s multi_step/test_correctness_llm.py
|
- pytest -v -s multi_step/test_correctness_llm.py
|
||||||
|
|
||||||
- label: Pipeline Parallelism Test # 45min
|
- label: Pipeline Parallelism Test # 45min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -695,7 +547,6 @@ steps:
|
|||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
- label: LoRA TP Test (Distributed)
|
- label: LoRA TP Test (Distributed)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
@ -704,14 +555,16 @@ steps:
|
|||||||
# FIXIT: find out which code initialize cuda before running the test
|
# FIXIT: find out which code initialize cuda before running the test
|
||||||
# before the fix, we need to use spawn to test it
|
# before the fix, we need to use spawn to test it
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
|
- pytest -v -s -x lora/test_long_context.py
|
||||||
|
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||||
# requires multi-GPU testing for validation.
|
# requires multi-GPU testing for validation.
|
||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
|
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -721,7 +574,6 @@ steps:
|
|||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
gpu: a100
|
gpu: a100
|
||||||
@ -730,7 +582,7 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/weight_loading
|
- tests/weight_loading
|
||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@ -742,7 +594,7 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
commands:
|
commands:
|
||||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||||
@ -760,4 +612,4 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||||
|
|||||||
@ -50,11 +50,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
|||||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
# if $normal_wheel matches cu118, do not upload the index.html
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
echo "Skipping index files for cu118 wheels"
|
echo "Skipping index files for cu118 wheels"
|
||||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
|
||||||
# if $normal_wheel matches cu126, do not upload the index.html
|
|
||||||
echo "Skipping index files for cu126 wheels"
|
|
||||||
else
|
else
|
||||||
# only upload index.html for cu128 wheels (default wheels)
|
# only upload index.html for cu12 wheels (default wheels)
|
||||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||||
fi
|
fi
|
||||||
@ -66,13 +63,9 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
|||||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
# if $normal_wheel matches cu118, do not upload the index.html
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
echo "Skipping index files for cu118 wheels"
|
echo "Skipping index files for cu118 wheels"
|
||||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
|
||||||
# if $normal_wheel matches cu126, do not upload the index.html
|
|
||||||
echo "Skipping index files for cu126 wheels"
|
|
||||||
else
|
else
|
||||||
# only upload index.html for cu128 wheels (default wheels)
|
# only upload index.html for cu12 wheels (default wheels)
|
||||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
|
||||||
34
.github/CODEOWNERS
vendored
34
.github/CODEOWNERS
vendored
@ -10,39 +10,27 @@
|
|||||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
/vllm/model_executor/guided_decoding @mgoin
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96
|
/vllm/multimodal @DarkLight1337 @ywang96
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
|
||||||
/vllm/lora @jeejeelee
|
|
||||||
CMakeLists.txt @tlrmchlsmth
|
CMakeLists.txt @tlrmchlsmth
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/structured_output @mgoin @russellb
|
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
|
||||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
|
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
|
||||||
|
/tests/models @DarkLight1337 @ywang96
|
||||||
|
/tests/multimodal @DarkLight1337 @ywang96
|
||||||
|
/tests/prefix_caching @comaniac @KuntaiDu
|
||||||
|
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||||
|
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||||
|
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||||
|
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||||
/tests/distributed/test_same_node.py @youkaichao
|
/tests/distributed/test_same_node.py @youkaichao
|
||||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
|
|
||||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
|
|
||||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
|
||||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
|
||||||
/tests/models @DarkLight1337 @ywang96
|
|
||||||
/tests/multi_step @alexm-redhat @comaniac
|
/tests/multi_step @alexm-redhat @comaniac
|
||||||
/tests/multimodal @DarkLight1337 @ywang96
|
|
||||||
/tests/prefix_caching @comaniac @KuntaiDu
|
|
||||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
|
||||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
|
||||||
/tests/v1/structured_output @mgoin @russellb
|
|
||||||
/tests/weight_loading @mgoin @youkaichao
|
/tests/weight_loading @mgoin @youkaichao
|
||||||
/tests/lora @jeejeelee
|
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||||
|
|
||||||
# Docs
|
|
||||||
/docs @hmellor
|
|
||||||
mkdocs.yaml @hmellor
|
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
|
|||||||
23
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
23
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
@ -14,22 +14,31 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||||
value: |
|
value: |
|
||||||
<details>
|
<details>
|
||||||
<summary>The output of <code>python collect_env.py</code></summary>
|
<summary>The output of `python collect_env.py`</summary>
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Your output of `python collect_env.py` here
|
Your output of `python collect_env.py` here
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
attributes:
|
||||||
|
label: Model Input Dumps
|
||||||
|
description: |
|
||||||
|
If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
|
||||||
|
placeholder: |
|
||||||
|
Upload the dumped input file.
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
- type: textarea
|
- type: textarea
|
||||||
attributes:
|
attributes:
|
||||||
label: 🐛 Describe the bug
|
label: 🐛 Describe the bug
|
||||||
@ -75,20 +84,20 @@ body:
|
|||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
The error message you got, with the full traceback.
|
||||||
```
|
```
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
- type: markdown
|
- type: markdown
|
||||||
attributes:
|
attributes:
|
||||||
value: |
|
value: >
|
||||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||||
|
|
||||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||||
|
|
||||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||||
|
|
||||||
Thanks for reporting 🙏!
|
Thanks for contributing 🎉!
|
||||||
- type: checkboxes
|
- type: checkboxes
|
||||||
id: askllm
|
id: askllm
|
||||||
attributes:
|
attributes:
|
||||||
|
|||||||
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
@ -1,69 +0,0 @@
|
|||||||
name: 🧪 CI failure report
|
|
||||||
description: Report a failing test.
|
|
||||||
title: "[CI Failure]: "
|
|
||||||
labels: ["ci-failure"]
|
|
||||||
|
|
||||||
body:
|
|
||||||
- type: markdown
|
|
||||||
attributes:
|
|
||||||
value: >
|
|
||||||
#### Include the name of the failing Buildkite step and test file in the title.
|
|
||||||
- type: input
|
|
||||||
attributes:
|
|
||||||
label: Name of failing test
|
|
||||||
description: |
|
|
||||||
Paste in the fully-qualified name of the failing test from the logs.
|
|
||||||
placeholder: |
|
|
||||||
`path/to/test_file.py::test_name[params]`
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: checkboxes
|
|
||||||
attributes:
|
|
||||||
label: Basic information
|
|
||||||
description: Select all items that apply to the failing test.
|
|
||||||
options:
|
|
||||||
- label: Flaky test
|
|
||||||
- label: Can reproduce locally
|
|
||||||
- label: Caused by external libraries (e.g. bug in `transformers`)
|
|
||||||
- type: textarea
|
|
||||||
attributes:
|
|
||||||
label: 🧪 Describe the failing test
|
|
||||||
description: |
|
|
||||||
Please provide a clear and concise description of the failing test.
|
|
||||||
placeholder: |
|
|
||||||
A clear and concise description of the failing test.
|
|
||||||
|
|
||||||
```
|
|
||||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
|
||||||
```
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
attributes:
|
|
||||||
label: 📝 History of failing test
|
|
||||||
description: |
|
|
||||||
Since when did the test start to fail?
|
|
||||||
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
|
|
||||||
|
|
||||||
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
|
|
||||||
|
|
||||||
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
|
|
||||||
|
|
||||||
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
|
|
||||||
|
|
||||||
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
|
|
||||||
placeholder: |
|
|
||||||
Approximate timeline and/or problematic PRs
|
|
||||||
|
|
||||||
A link to the Buildkite analytics of the failing test (if available)
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
attributes:
|
|
||||||
label: CC List.
|
|
||||||
description: >
|
|
||||||
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
|
|
||||||
- type: markdown
|
|
||||||
attributes:
|
|
||||||
value: >
|
|
||||||
Thanks for reporting 🙏!
|
|
||||||
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
@ -9,7 +9,7 @@ body:
|
|||||||
value: >
|
value: >
|
||||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||||
|
|
||||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
|
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
|
||||||
- type: textarea
|
- type: textarea
|
||||||
attributes:
|
attributes:
|
||||||
label: The model to consider.
|
label: The model to consider.
|
||||||
|
|||||||
@ -35,7 +35,7 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
|
|||||||
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
Normal file
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
name: 🎲 Misc/random discussions that do not fit into the above categories.
|
||||||
|
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
|
||||||
|
title: "[Misc]: "
|
||||||
|
labels: ["misc"]
|
||||||
|
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: >
|
||||||
|
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||||
|
- type: textarea
|
||||||
|
attributes:
|
||||||
|
label: Anything you want to discuss about vllm.
|
||||||
|
description: >
|
||||||
|
Anything you want to discuss about vllm.
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: >
|
||||||
|
Thanks for contributing 🎉!
|
||||||
|
- type: checkboxes
|
||||||
|
id: askllm
|
||||||
|
attributes:
|
||||||
|
label: Before submitting a new issue...
|
||||||
|
options:
|
||||||
|
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||||
|
required: true
|
||||||
4
.github/ISSUE_TEMPLATE/config.yml
vendored
4
.github/ISSUE_TEMPLATE/config.yml
vendored
@ -1,5 +1 @@
|
|||||||
blank_issues_enabled: false
|
blank_issues_enabled: false
|
||||||
contact_links:
|
|
||||||
- name: Questions
|
|
||||||
url: https://discuss.vllm.ai
|
|
||||||
about: Ask questions and discuss with other vLLM community members
|
|
||||||
|
|||||||
3
.github/PULL_REQUEST_TEMPLATE.md
vendored
3
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -2,5 +2,4 @@ FILL IN THE PR DESCRIPTION HERE
|
|||||||
|
|
||||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||||
|
|
||||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
|
||||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
|
||||||
|
|||||||
2
.github/dependabot.yml
vendored
2
.github/dependabot.yml
vendored
@ -23,7 +23,7 @@ updates:
|
|||||||
- dependency-name: "lm-format-enforcer"
|
- dependency-name: "lm-format-enforcer"
|
||||||
- dependency-name: "gguf"
|
- dependency-name: "gguf"
|
||||||
- dependency-name: "compressed-tensors"
|
- dependency-name: "compressed-tensors"
|
||||||
- dependency-name: "ray[cgraph]" # Ray Compiled Graph
|
- dependency-name: "ray[adag]"
|
||||||
- dependency-name: "lm-eval"
|
- dependency-name: "lm-eval"
|
||||||
groups:
|
groups:
|
||||||
minor-update:
|
minor-update:
|
||||||
|
|||||||
124
.github/mergify.yml
vendored
124
.github/mergify.yml
vendored
@ -5,7 +5,6 @@ pull_request_rules:
|
|||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
- files~=^examples/
|
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
@ -19,7 +18,7 @@ pull_request_rules:
|
|||||||
- files~=\.buildkite/
|
- files~=\.buildkite/
|
||||||
- files~=^cmake/
|
- files~=^cmake/
|
||||||
- files=CMakeLists.txt
|
- files=CMakeLists.txt
|
||||||
- files~=^docker/Dockerfile
|
- files~=^Dockerfile
|
||||||
- files~=^requirements.*\.txt
|
- files~=^requirements.*\.txt
|
||||||
- files=setup.py
|
- files=setup.py
|
||||||
actions:
|
actions:
|
||||||
@ -36,116 +35,6 @@ pull_request_rules:
|
|||||||
add:
|
add:
|
||||||
- frontend
|
- frontend
|
||||||
|
|
||||||
- name: label-multi-modality
|
|
||||||
description: Automatically apply multi-modality label
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=^vllm/multimodal/
|
|
||||||
- files~=^tests/multimodal/
|
|
||||||
- files~=^tests/models/multimodal/
|
|
||||||
- files~=^tests/models/*/audio_language/
|
|
||||||
- files~=^tests/models/*/vision_language/
|
|
||||||
- files=tests/models/test_vision.py
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- multi-modality
|
|
||||||
|
|
||||||
- name: label-structured-output
|
|
||||||
description: Automatically apply structured-output label
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=^benchmarks/structured_schemas/
|
|
||||||
- files=benchmarks/benchmark_serving_structured_output.py
|
|
||||||
- files=benchmarks/run_structured_output_benchmark.sh
|
|
||||||
- files=docs/features/structured_outputs.md
|
|
||||||
- files=examples/offline_inference/structured_outputs.py
|
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
|
||||||
- files~=^vllm/model_executor/guided_decoding/
|
|
||||||
- files=tests/model_executor/test_guided_processors.py
|
|
||||||
- files=tests/entrypoints/llm/test_guided_generate.py
|
|
||||||
- files~=^tests/v1/structured_output/
|
|
||||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
|
||||||
- files~=^vllm/v1/structured_output/
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- structured-output
|
|
||||||
|
|
||||||
- name: label-speculative-decoding
|
|
||||||
description: Automatically apply speculative-decoding label
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=^vllm/spec_decode/
|
|
||||||
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
|
|
||||||
- files~=^tests/spec_decode/
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- speculative-decoding
|
|
||||||
|
|
||||||
- name: label-v1
|
|
||||||
description: Automatically apply v1 label
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=^vllm/v1/
|
|
||||||
- files~=^tests/v1/
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- v1
|
|
||||||
|
|
||||||
- name: label-tpu
|
|
||||||
description: Automatically apply tpu label
|
|
||||||
# Keep this list in sync with `label-tpu-remove` conditions
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=tpu.py
|
|
||||||
- files~=_tpu
|
|
||||||
- files~=tpu_
|
|
||||||
- files~=/tpu/
|
|
||||||
- files~=pallas
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- tpu
|
|
||||||
|
|
||||||
- name: label-tpu-remove
|
|
||||||
description: Automatically remove tpu label
|
|
||||||
# Keep this list in sync with `label-tpu` conditions
|
|
||||||
conditions:
|
|
||||||
- and:
|
|
||||||
- -files~=tpu.py
|
|
||||||
- -files~=_tpu
|
|
||||||
- -files~=tpu_
|
|
||||||
- -files~=/tpu/
|
|
||||||
- -files~=pallas
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
remove:
|
|
||||||
- tpu
|
|
||||||
|
|
||||||
- name: label-tool-calling
|
|
||||||
description: Automatically add tool-calling label
|
|
||||||
conditions:
|
|
||||||
- or:
|
|
||||||
- files~=^tests/tool_use/
|
|
||||||
- files~=^tests/mistral_tool_use/
|
|
||||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
|
||||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
|
||||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
|
||||||
- files=docs/features/tool_calling.md
|
|
||||||
- files~=^examples/tool_chat_*
|
|
||||||
- files=examples/offline_inference/chat_with_tools.py
|
|
||||||
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
|
|
||||||
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
|
|
||||||
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- tool-calling
|
|
||||||
|
|
||||||
- name: ping author on conflicts and add 'needs-rebase' label
|
- name: ping author on conflicts and add 'needs-rebase' label
|
||||||
conditions:
|
conditions:
|
||||||
- conflict
|
- conflict
|
||||||
@ -161,17 +50,6 @@ pull_request_rules:
|
|||||||
|
|
||||||
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||||
|
|
||||||
- name: assign reviewer for tensorizer changes
|
|
||||||
conditions:
|
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
|
||||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
- files~=^tests/tensorizer_loader/
|
|
||||||
actions:
|
|
||||||
assign:
|
|
||||||
users:
|
|
||||||
- "sangstar"
|
|
||||||
|
|
||||||
- name: remove 'needs-rebase' label when conflict is resolved
|
- name: remove 'needs-rebase' label when conflict is resolved
|
||||||
conditions:
|
conditions:
|
||||||
- -conflict
|
- -conflict
|
||||||
|
|||||||
2
.github/scripts/cleanup_pr_body.sh
vendored
2
.github/scripts/cleanup_pr_body.sh
vendored
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
|||||||
|
|
||||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||||
python3 - <<EOF
|
python3 - <<EOF
|
||||||
import regex as re
|
import re
|
||||||
|
|
||||||
with open("${NEW}", "r") as file:
|
with open("${NEW}", "r") as file:
|
||||||
content = file.read()
|
content = file.read()
|
||||||
|
|||||||
2
.github/workflows/add_label_automerge.yml
vendored
2
.github/workflows/add_label_automerge.yml
vendored
@ -1,6 +1,4 @@
|
|||||||
name: Add label on auto-merge enabled
|
name: Add label on auto-merge enabled
|
||||||
permissions:
|
|
||||||
pull-requests: write
|
|
||||||
on:
|
on:
|
||||||
pull_request_target:
|
pull_request_target:
|
||||||
types:
|
types:
|
||||||
|
|||||||
9
.github/workflows/cleanup_pr_body.yml
vendored
9
.github/workflows/cleanup_pr_body.yml
vendored
@ -16,16 +16,11 @@ jobs:
|
|||||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: '3.12'
|
python-version: '3.12'
|
||||||
|
|
||||||
- name: Install Python dependencies
|
|
||||||
run: |
|
|
||||||
python3 -m pip install --upgrade pip
|
|
||||||
python3 -m pip install regex
|
|
||||||
|
|
||||||
- name: Update PR description
|
- name: Update PR description
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||||
|
|||||||
17
.github/workflows/lint-and-deploy.yaml
vendored
17
.github/workflows/lint-and-deploy.yaml
vendored
@ -2,9 +2,6 @@ name: Lint and Deploy Charts
|
|||||||
|
|
||||||
on: pull_request
|
on: pull_request
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
lint-and-deploy:
|
lint-and-deploy:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@ -15,17 +12,17 @@ jobs:
|
|||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Set up Helm
|
- name: Set up Helm
|
||||||
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
|
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
|
||||||
with:
|
with:
|
||||||
version: v3.14.4
|
version: v3.14.4
|
||||||
|
|
||||||
#Python is required because ct lint runs Yamale and yamllint which require Python.
|
#Python is required because ct lint runs Yamale and yamllint which require Python.
|
||||||
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: '3.13'
|
python-version: '3.13'
|
||||||
|
|
||||||
- name: Set up chart-testing
|
- name: Set up chart-testing
|
||||||
uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
|
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
|
||||||
with:
|
with:
|
||||||
version: v3.10.1
|
version: v3.10.1
|
||||||
|
|
||||||
@ -50,10 +47,10 @@ jobs:
|
|||||||
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
|
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
|
||||||
|
|
||||||
- name: Create kind cluster
|
- name: Create kind cluster
|
||||||
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
|
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
|
||||||
|
|
||||||
- name: Build the Docker image vllm cpu
|
- name: Build the Docker image vllm cpu
|
||||||
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
|
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
|
||||||
|
|
||||||
- name: Configuration of docker images, network and namespace for the kind cluster
|
- name: Configuration of docker images, network and namespace for the kind cluster
|
||||||
run: |
|
run: |
|
||||||
@ -69,7 +66,7 @@ jobs:
|
|||||||
export AWS_SECRET_ACCESS_KEY=minioadmin
|
export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||||
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
||||||
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
||||||
|
|
||||||
- name: curl test
|
- name: curl test
|
||||||
run: |
|
run: |
|
||||||
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
|
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
|
||||||
@ -82,4 +79,4 @@ jobs:
|
|||||||
"max_tokens": 7,
|
"max_tokens": 7,
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
}'):$CODE"
|
}'):$CODE"
|
||||||
echo "$CODE"
|
echo "$CODE"
|
||||||
6
.github/workflows/pre-commit.yml
vendored
6
.github/workflows/pre-commit.yml
vendored
@ -5,19 +5,15 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches: [main]
|
branches: [main]
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pre-commit:
|
pre-commit:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: "3.12"
|
python-version: "3.12"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
|
||||||
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
||||||
with:
|
with:
|
||||||
extra_args: --all-files --hook-stage manual
|
extra_args: --all-files --hook-stage manual
|
||||||
|
|||||||
4
.github/workflows/publish.yml
vendored
4
.github/workflows/publish.yml
vendored
@ -39,7 +39,7 @@ jobs:
|
|||||||
const script = require('.github/workflows/scripts/create_release.js')
|
const script = require('.github/workflows/scripts/create_release.js')
|
||||||
await script(github, context, core)
|
await script(github, context, core)
|
||||||
|
|
||||||
# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
|
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
|
||||||
# wheel:
|
# wheel:
|
||||||
# name: Build Wheel
|
# name: Build Wheel
|
||||||
# runs-on: ${{ matrix.os }}
|
# runs-on: ${{ matrix.os }}
|
||||||
@ -50,7 +50,7 @@ jobs:
|
|||||||
# matrix:
|
# matrix:
|
||||||
# os: ['ubuntu-20.04']
|
# os: ['ubuntu-20.04']
|
||||||
# python-version: ['3.9', '3.10', '3.11', '3.12']
|
# python-version: ['3.9', '3.10', '3.11', '3.12']
|
||||||
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
|
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||||
# cuda-version: ['11.8', '12.1']
|
# cuda-version: ['11.8', '12.1']
|
||||||
|
|
||||||
# steps:
|
# steps:
|
||||||
|
|||||||
10
.github/workflows/reminder_comment.yml
vendored
10
.github/workflows/reminder_comment.yml
vendored
@ -1,9 +1,8 @@
|
|||||||
name: PR Reminder Comment Bot
|
name: PR Reminder Comment Bot
|
||||||
permissions:
|
|
||||||
pull-requests: write
|
|
||||||
on:
|
on:
|
||||||
pull_request_target:
|
pull_request_target:
|
||||||
types: [opened]
|
types: [opened]
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pr_reminder:
|
pr_reminder:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@ -16,12 +15,7 @@ jobs:
|
|||||||
owner: context.repo.owner,
|
owner: context.repo.owner,
|
||||||
repo: context.repo.repo,
|
repo: context.repo.repo,
|
||||||
issue_number: context.issue.number,
|
issue_number: context.issue.number,
|
||||||
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
|
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
|
||||||
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
|
|
||||||
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
|
|
||||||
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
|
|
||||||
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
|
|
||||||
'🚀'
|
|
||||||
})
|
})
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|||||||
2
.github/workflows/scripts/build.sh
vendored
2
.github/workflows/scripts/build.sh
vendored
@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
|
|||||||
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
|
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
# Install requirements
|
# Install requirements
|
||||||
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
|
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
|
||||||
|
|
||||||
# Limit the number of parallel jobs to avoid OOM
|
# Limit the number of parallel jobs to avoid OOM
|
||||||
export MAX_JOBS=1
|
export MAX_JOBS=1
|
||||||
|
|||||||
2
.github/workflows/scripts/create_release.js
vendored
2
.github/workflows/scripts/create_release.js
vendored
@ -1,4 +1,4 @@
|
|||||||
// Uses GitHub's API to create the release and wait for result.
|
// Uses Github's API to create the release and wait for result.
|
||||||
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
|
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
|
||||||
|
|
||||||
module.exports = async (github, context, core) => {
|
module.exports = async (github, context, core) => {
|
||||||
|
|||||||
2
.github/workflows/stale.yml
vendored
2
.github/workflows/stale.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
|||||||
actions: write
|
actions: write
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
|
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
|
||||||
with:
|
with:
|
||||||
# Increasing this value ensures that changes to this workflow
|
# Increasing this value ensures that changes to this workflow
|
||||||
# propagate to all issues and PRs in days rather than months
|
# propagate to all issues and PRs in days rather than months
|
||||||
|
|||||||
12
.gitignore
vendored
12
.gitignore
vendored
@ -2,7 +2,7 @@
|
|||||||
/vllm/_version.py
|
/vllm/_version.py
|
||||||
|
|
||||||
# vllm-flash-attn built from source
|
# vllm-flash-attn built from source
|
||||||
vllm/vllm_flash_attn/*
|
vllm/vllm_flash_attn/
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
@ -77,6 +77,10 @@ instance/
|
|||||||
# Scrapy stuff:
|
# Scrapy stuff:
|
||||||
.scrapy
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
docs/source/getting_started/examples/
|
||||||
|
|
||||||
# PyBuilder
|
# PyBuilder
|
||||||
.pybuilder/
|
.pybuilder/
|
||||||
target/
|
target/
|
||||||
@ -146,7 +150,6 @@ venv.bak/
|
|||||||
|
|
||||||
# mkdocs documentation
|
# mkdocs documentation
|
||||||
/site
|
/site
|
||||||
docs/examples
|
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
@ -194,11 +197,8 @@ _build/
|
|||||||
hip_compat.h
|
hip_compat.h
|
||||||
|
|
||||||
# Benchmark dataset
|
# Benchmark dataset
|
||||||
benchmarks/**/*.json
|
benchmarks/*.json
|
||||||
|
|
||||||
# Linting
|
# Linting
|
||||||
actionlint
|
actionlint
|
||||||
shellcheck*/
|
shellcheck*/
|
||||||
|
|
||||||
# Ingore moe/marlin_moe gen code
|
|
||||||
csrc/moe/marlin_moe_wna16/kernel_*
|
|
||||||
|
|||||||
@ -1,56 +1,43 @@
|
|||||||
default_install_hook_types:
|
|
||||||
- pre-commit
|
|
||||||
- commit-msg
|
|
||||||
default_stages:
|
default_stages:
|
||||||
- pre-commit # Run locally
|
- pre-commit # Run locally
|
||||||
- manual # Run in CI
|
- manual # Run in CI
|
||||||
exclude: 'vllm/third_party/.*'
|
|
||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/google/yapf
|
- repo: https://github.com/google/yapf
|
||||||
rev: v0.43.0
|
rev: v0.43.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: yapf
|
- id: yapf
|
||||||
args: [--in-place, --verbose]
|
args: [--in-place, --verbose]
|
||||||
|
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.11.7
|
rev: v0.9.3
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff
|
- id: ruff
|
||||||
args: [--output-format, github, --fix]
|
args: [--output-format, github]
|
||||||
- id: ruff-format
|
|
||||||
files: ^(.buildkite|benchmarks|examples)/.*
|
|
||||||
- repo: https://github.com/codespell-project/codespell
|
- repo: https://github.com/codespell-project/codespell
|
||||||
rev: v2.4.1
|
rev: v2.4.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: codespell
|
- id: codespell
|
||||||
additional_dependencies: ['tomli']
|
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
|
||||||
args: ['--toml', 'pyproject.toml']
|
|
||||||
- repo: https://github.com/PyCQA/isort
|
- repo: https://github.com/PyCQA/isort
|
||||||
rev: 6.0.1
|
rev: 5.13.2
|
||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: isort
|
||||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||||
rev: v20.1.3
|
rev: v19.1.7
|
||||||
hooks:
|
hooks:
|
||||||
- id: clang-format
|
- id: clang-format
|
||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
|
||||||
types_or: [c++, cuda]
|
types_or: [c++, cuda]
|
||||||
args: [--style=file, --verbose]
|
args: [--style=file, --verbose]
|
||||||
- repo: https://github.com/jackdewinter/pymarkdown
|
- repo: https://github.com/jackdewinter/pymarkdown
|
||||||
rev: v0.9.29
|
rev: v0.9.27
|
||||||
hooks:
|
hooks:
|
||||||
- id: pymarkdown
|
- id: pymarkdown
|
||||||
exclude: '.*\.inc\.md'
|
files: docs/.*
|
||||||
args: [fix]
|
|
||||||
- repo: https://github.com/rhysd/actionlint
|
- repo: https://github.com/rhysd/actionlint
|
||||||
rev: v1.7.7
|
rev: v1.7.7
|
||||||
hooks:
|
hooks:
|
||||||
- id: actionlint
|
- id: actionlint
|
||||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
|
||||||
rev: 0.6.17
|
|
||||||
hooks:
|
|
||||||
- id: pip-compile
|
|
||||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
|
||||||
files: ^requirements/test\.(in|txt)$
|
|
||||||
- repo: local
|
- repo: local
|
||||||
hooks:
|
hooks:
|
||||||
- id: mypy-local
|
- id: mypy-local
|
||||||
@ -58,7 +45,7 @@ repos:
|
|||||||
entry: tools/mypy.sh 0 "local"
|
entry: tools/mypy.sh 0 "local"
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
|
||||||
stages: [pre-commit] # Don't run in CI
|
stages: [pre-commit] # Don't run in CI
|
||||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.9
|
name: Run mypy for Python 3.9
|
||||||
@ -98,56 +85,9 @@ repos:
|
|||||||
entry: tools/png-lint.sh
|
entry: tools/png-lint.sh
|
||||||
language: script
|
language: script
|
||||||
types: [png]
|
types: [png]
|
||||||
- id: signoff-commit
|
|
||||||
name: Sign-off Commit
|
|
||||||
entry: bash
|
|
||||||
args:
|
|
||||||
- -c
|
|
||||||
- |
|
|
||||||
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
|
|
||||||
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
|
|
||||||
fi
|
|
||||||
language: system
|
|
||||||
verbose: true
|
|
||||||
stages: [commit-msg]
|
|
||||||
- id: check-spdx-header
|
|
||||||
name: Check SPDX headers
|
|
||||||
entry: python tools/check_spdx_header.py
|
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
- id: check-filenames
|
|
||||||
name: Check for spaces in all filenames
|
|
||||||
entry: bash
|
|
||||||
args:
|
|
||||||
- -c
|
|
||||||
- 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
|
|
||||||
language: system
|
|
||||||
always_run: true
|
|
||||||
pass_filenames: false
|
|
||||||
- id: update-dockerfile-graph
|
|
||||||
name: Update Dockerfile dependency graph
|
|
||||||
entry: tools/update-dockerfile-graph.sh
|
|
||||||
language: script
|
|
||||||
- id: enforce-import-regex-instead-of-re
|
|
||||||
name: Enforce import regex as re
|
|
||||||
entry: python tools/enforce_regex_import.py
|
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
# forbid directly import triton
|
|
||||||
- id: forbid-direct-triton-import
|
|
||||||
name: "Forbid direct 'import triton'"
|
|
||||||
entry: python tools/check_triton_import.py
|
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
# Keep `suggestion` last
|
|
||||||
- id: suggestion
|
- id: suggestion
|
||||||
name: Suggestion
|
name: Suggestion
|
||||||
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
|
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
|
||||||
language: system
|
language: system
|
||||||
verbose: true
|
verbose: true
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
# Insert new entries above the `suggestion` entry
|
|
||||||
|
|||||||
@ -8,10 +8,14 @@ build:
|
|||||||
tools:
|
tools:
|
||||||
python: "3.12"
|
python: "3.12"
|
||||||
|
|
||||||
mkdocs:
|
sphinx:
|
||||||
configuration: mkdocs.yaml
|
configuration: docs/source/conf.py
|
||||||
|
fail_on_warning: true
|
||||||
|
|
||||||
|
# If using Sphinx, optionally build your docs in additional formats such as PDF
|
||||||
|
formats: []
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
# Optionally declare the Python requirements required to build your docs
|
||||||
python:
|
python:
|
||||||
install:
|
install:
|
||||||
- requirements: requirements/docs.txt
|
- requirements: docs/requirements-docs.txt
|
||||||
|
|||||||
434
CMakeLists.txt
Normal file → Executable file
434
CMakeLists.txt
Normal file → Executable file
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
|
|||||||
|
|
||||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||||
|
|
||||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||||
|
|
||||||
@ -29,8 +30,11 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
|
|||||||
#
|
#
|
||||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||||
|
|
||||||
|
# Supported NVIDIA architectures.
|
||||||
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported/expected torch versions for CUDA/ROCm.
|
# Supported/expected torch versions for CUDA/ROCm.
|
||||||
@ -40,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
|||||||
#
|
#
|
||||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@ -76,15 +80,6 @@ endif()
|
|||||||
#
|
#
|
||||||
find_package(Torch REQUIRED)
|
find_package(Torch REQUIRED)
|
||||||
|
|
||||||
# Supported NVIDIA architectures.
|
|
||||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
|
||||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
|
||||||
else()
|
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||||
#
|
#
|
||||||
@ -179,25 +174,6 @@ include(FetchContent)
|
|||||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||||
|
|
||||||
#
|
|
||||||
# Set rocm version dev int.
|
|
||||||
#
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
|
||||||
#
|
|
||||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
|
||||||
#
|
|
||||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
|
||||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
|
||||||
#
|
|
||||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Define other extension targets
|
# Define other extension targets
|
||||||
#
|
#
|
||||||
@ -216,7 +192,7 @@ set_gencode_flags_for_srcs(
|
|||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
message(STATUS "Enabling cumem allocator extension.")
|
message(STATUS "Enabling cumem allocator extension.")
|
||||||
# link against cuda driver library
|
# link against cuda driver library
|
||||||
list(APPEND CUMEM_LIBS CUDA::cuda_driver)
|
list(APPEND CUMEM_LIBS cuda)
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
cumem_allocator
|
cumem_allocator
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
@ -232,34 +208,27 @@ endif()
|
|||||||
#
|
#
|
||||||
|
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
|
||||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
|
||||||
"csrc/cache_kernels.cu"
|
"csrc/cache_kernels.cu"
|
||||||
"csrc/attention/paged_attention_v1.cu"
|
"csrc/attention/paged_attention_v1.cu"
|
||||||
"csrc/attention/paged_attention_v2.cu"
|
"csrc/attention/paged_attention_v2.cu"
|
||||||
"csrc/attention/merge_attn_states.cu"
|
|
||||||
"csrc/attention/vertical_slash_index.cu"
|
|
||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/layernorm_quant_kernels.cu"
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
"csrc/cuda_view.cu"
|
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||||
"csrc/quantization/activation_kernels.cu"
|
|
||||||
"csrc/cuda_utils_kernels.cu"
|
"csrc/cuda_utils_kernels.cu"
|
||||||
"csrc/prepare_inputs/advance_step.cu"
|
"csrc/prepare_inputs/advance_step.cu"
|
||||||
"csrc/custom_all_reduce.cu"
|
|
||||||
"csrc/torch_bindings.cpp")
|
"csrc/torch_bindings.cpp")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||||
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
|
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@ -276,8 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
GIT_TAG v3.6.0
|
||||||
GIT_TAG ${CUTLASS_REVISION}
|
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
@ -289,16 +257,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
FetchContent_MakeAvailable(cutlass)
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC
|
list(APPEND VLLM_EXT_SRC
|
||||||
|
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||||
|
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||||
"csrc/quantization/awq/gemm_kernels.cu"
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
|
"csrc/custom_all_reduce.cu"
|
||||||
"csrc/permute_cols.cu"
|
"csrc/permute_cols.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
|
||||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||||
"csrc/cutlass_extensions/common.cpp"
|
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
|
||||||
"csrc/attention/mla/cutlass_mla_entry.cu")
|
"csrc/cutlass_extensions/common.cpp")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_EXT_SRC}"
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
@ -307,55 +275,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
# 9.0 for latest bf16 atomicAdd PTX
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
|
||||||
# For the Marlin kernels we automatically generate sources for various
|
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
set(MARLIN_GEN_SCRIPT
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
|
||||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
|
||||||
|
|
||||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
|
||||||
RESULT_VARIABLE marlin_generation_result
|
|
||||||
OUTPUT_VARIABLE marlin_generation_result
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT marlin_generation_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR "Marlin generation failed."
|
|
||||||
" Result: \"${marlin_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
|
||||||
else()
|
|
||||||
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
|
||||||
message(STATUS "Marlin generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
|
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
||||||
@ -372,89 +295,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Only build AllSpark kernels if we are building for at least some compatible archs.
|
|
||||||
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
|
|
||||||
if (ALLSPARK_ARCHS)
|
|
||||||
set(ALLSPARK_SRCS
|
|
||||||
"csrc/quantization/gptq_allspark/allspark_repack.cu"
|
|
||||||
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${ALLSPARK_SRCS}"
|
|
||||||
CUDA_ARCHS "${ALLSPARK_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
|
|
||||||
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building AllSpark kernels as no compatible archs found"
|
|
||||||
" in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
set(SCALED_MM_3X_ARCHS)
|
|
||||||
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
||||||
# CUDA 12.0 or later
|
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
set(SRCS
|
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
|
||||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
else()
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
|
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
|
||||||
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||||
"later if you intend on running FP8 quantized models on "
|
"later if you intend on running FP8 quantized models on "
|
||||||
"Hopper.")
|
"Hopper.")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
|
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
|
||||||
"in CUDA target architectures")
|
"in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
|
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
|
||||||
# CUDA 12.8 or later
|
# build any 3x kernels
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
|
set(SCALED_MM_3X_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
|
||||||
)
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
|
|
||||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
|
||||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
|
|
||||||
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
|
|
||||||
"later if you intend on running FP8 quantized models on "
|
|
||||||
"Blackwell.")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
|
|
||||||
"in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||||
# kernels for the remaining archs that are not already built for 3x.
|
# kernels for the remaining archs that are not already built for 3x.
|
||||||
# (Build 8.9 for FP8)
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||||
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
|
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
@ -479,18 +351,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# 2:4 Sparse Kernels
|
# 2:4 Sparse Kernels
|
||||||
|
|
||||||
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
|
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
|
||||||
# require CUDA 12.2 or later (and only work on Hopper).
|
# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
|
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
|
||||||
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
|
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
|
||||||
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||||
else()
|
else()
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
|
||||||
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
|
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
|
||||||
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
|
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
|
||||||
"if you intend on running FP8 sparse quantized models on Hopper.")
|
"if you intend on running FP8 sparse quantized models on Hopper.")
|
||||||
@ -500,71 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# FP4 Archs and flags
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${FP4_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
|
|
||||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
|
||||||
# clear FP4_ARCHS
|
|
||||||
set(FP4_ARCHS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/attention/mla/cutlass_mla_kernels.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${MLA_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
|
|
||||||
# Add MLA-specific include directories only to MLA source files
|
|
||||||
set_source_files_properties(${SRCS}
|
|
||||||
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
|
|
||||||
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
|
|
||||||
# clear MLA_ARCHS
|
|
||||||
set(MLA_ARCHS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# CUTLASS MoE kernels
|
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
|
||||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
|
||||||
# to compile MoE kernels that use its output.
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
|
|
||||||
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
|
||||||
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
|
|
||||||
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
|
|
||||||
"if you intend on running FP8 quantized MoE models on Hopper.")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
|
||||||
"in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
@ -646,8 +453,7 @@ define_gpu_extension_target(
|
|||||||
SOURCES ${VLLM_EXT_SRC}
|
SOURCES ${VLLM_EXT_SRC}
|
||||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
|
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||||
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
@ -666,71 +472,28 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
"csrc/moe/moe_align_sum_kernels.cu"
|
"csrc/moe/moe_align_sum_kernels.cu"
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_MOE_EXT_SRC}"
|
SRCS "${VLLM_MOE_EXT_SRC}"
|
||||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
set(VLLM_MOE_WNA16_SRC
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
"csrc/moe/moe_wna16.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${VLLM_MOE_WNA16_SRC}"
|
|
||||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
|
||||||
# 9.0 for latest bf16 atomicAdd PTX
|
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
if (MARLIN_MOE_ARCHS)
|
if (MARLIN_MOE_ARCHS)
|
||||||
|
set(MARLIN_MOE_SRC
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||||
|
"csrc/moe/marlin_moe_ops.cu")
|
||||||
|
|
||||||
#
|
|
||||||
# For the Marlin MOE kernels we automatically generate sources for various
|
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
|
||||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
|
||||||
|
|
||||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
|
||||||
RESULT_VARIABLE moe_marlin_generation_result
|
|
||||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT moe_marlin_generation_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR "Marlin MOE generation failed."
|
|
||||||
" Result: \"${moe_marlin_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
|
||||||
else()
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
|
||||||
message(STATUS "Marlin MOE generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
SRCS "${MARLIN_MOE_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||||
|
|
||||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
@ -738,17 +501,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|
||||||
set(MOE_PERMUTE_SRC
|
|
||||||
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
|
|
||||||
"csrc/moe/moe_permute_unpermute_op.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_PERMUTE_SRC}"
|
|
||||||
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
|
|
||||||
endif()
|
|
||||||
message(STATUS "Enabling moe extension.")
|
message(STATUS "Enabling moe extension.")
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_moe_C
|
_moe_C
|
||||||
@ -757,8 +509,6 @@ define_gpu_extension_target(
|
|||||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
|
|
||||||
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
@ -768,7 +518,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
#
|
#
|
||||||
set(VLLM_ROCM_EXT_SRC
|
set(VLLM_ROCM_EXT_SRC
|
||||||
"csrc/rocm/torch_bindings.cpp"
|
"csrc/rocm/torch_bindings.cpp"
|
||||||
"csrc/rocm/skinny_gemms.cu"
|
|
||||||
"csrc/rocm/attention.cu")
|
"csrc/rocm/attention.cu")
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
@ -782,8 +531,77 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# For CUDA we also build and ship some external projects.
|
# vllm-flash-attn currently only supported on CUDA
|
||||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(cmake/external_projects/flashmla.cmake)
|
return()
|
||||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||||
|
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||||
|
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||||
|
# we need to manually set VLLM_GPU_ARCHES here.
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
foreach(_ARCH ${CUDA_ARCHS})
|
||||||
|
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||||
|
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Build vLLM flash attention from source
|
||||||
|
#
|
||||||
|
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||||
|
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
|
||||||
|
# They should be identical but if they aren't, this is a massive footgun.
|
||||||
|
#
|
||||||
|
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||||
|
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||||
|
# If no component is specified, vllm-flash-attn is still installed.
|
||||||
|
|
||||||
|
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||||
|
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||||
|
# It can be set as an environment variable or passed as a cmake argument.
|
||||||
|
# The environment variable takes precedence.
|
||||||
|
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||||
|
FetchContent_Declare(
|
||||||
|
vllm-flash-attn SOURCE_DIR
|
||||||
|
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||||
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
)
|
||||||
|
else()
|
||||||
|
FetchContent_Declare(
|
||||||
|
vllm-flash-attn
|
||||||
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
|
GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
# Don't share the vllm-flash-attn build between build types
|
||||||
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the vllm-flash-attn library
|
||||||
|
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||||
|
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||||
|
|
||||||
|
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||||
|
# case only one is built, in the case both are built redundant work is done)
|
||||||
|
install(
|
||||||
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
|
DESTINATION vllm_flash_attn
|
||||||
|
COMPONENT _vllm_fa2_C
|
||||||
|
FILES_MATCHING PATTERN "*.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
install(
|
||||||
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
|
DESTINATION vllm_flash_attn
|
||||||
|
COMPONENT _vllm_fa3_C
|
||||||
|
FILES_MATCHING PATTERN "*.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Nothing after vllm-flash-attn, see comment about macros above
|
||||||
|
|||||||
@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
|
|||||||
|
|
||||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
reported to the community leaders responsible for enforcement in the #code-of-conduct
|
reported to the community leaders responsible for enforcement in the #code-of-conduct
|
||||||
channel in the [vLLM Slack](https://slack.vllm.ai).
|
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
|
||||||
All complaints will be reviewed and investigated promptly and fairly.
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
All community leaders are obligated to respect the privacy and security of the
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
@ -125,3 +125,4 @@ Community Impact Guidelines were inspired by
|
|||||||
For answers to common questions about this code of conduct, see the
|
For answers to common questions about this code of conduct, see the
|
||||||
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
|
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
|
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,3 @@
|
|||||||
# Contributing to vLLM
|
# Contributing to vLLM
|
||||||
|
|
||||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
|
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
|
||||||
|
|||||||
@ -2,14 +2,14 @@
|
|||||||
# to run the OpenAI compatible server.
|
# to run the OpenAI compatible server.
|
||||||
|
|
||||||
# Please update any changes made here to
|
# Please update any changes made here to
|
||||||
# docs/contributing/dockerfile/dockerfile.md and
|
# docs/source/contributing/dockerfile/dockerfile.md and
|
||||||
# docs/assets/contributing/dockerfile-stages-dependency.png
|
# docs/source/assets/contributing/dockerfile-stages-dependency.png
|
||||||
|
|
||||||
ARG CUDA_VERSION=12.8.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||||
ARG CUDA_VERSION=12.8.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||||
&& for i in 1 2 3; do \
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
|
||||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
|
||||||
done \
|
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
@ -30,14 +27,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
# Install uv for faster pip installs
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
python3 -m pip install uv
|
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
|
|
||||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||||
# as it was causing spam when compiling the CUTLASS kernels
|
# as it was causing spam when compiling the CUTLASS kernels
|
||||||
@ -61,23 +50,21 @@ WORKDIR /workspace
|
|||||||
# we need to install torch and torchvision from the nightly builds first,
|
# we need to install torch and torchvision from the nightly builds first,
|
||||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||||
# after this step
|
# after this step
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
COPY requirements/common.txt requirements/common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements/cuda.txt requirements/cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system -r requirements/cuda.txt \
|
python3 -m pip install -r requirements-cuda.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
# can be useful for both `dev` and `test`
|
# can be useful for both `dev` and `test`
|
||||||
# explicitly set the list to avoid issues with torch 2.2
|
# explicitly set the list to avoid issues with torch 2.2
|
||||||
# see https://github.com/pytorch/pytorch/pull/123243
|
# see https://github.com/pytorch/pytorch/pull/123243
|
||||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
|
||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
# Override the arch list for flash-attn to reduce the binary size
|
# Override the arch list for flash-attn to reduce the binary size
|
||||||
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
|
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
|
||||||
@ -89,21 +76,15 @@ FROM base AS build
|
|||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
|
||||||
# install build dependencies
|
# install build dependencies
|
||||||
COPY requirements/build.txt requirements/build.txt
|
COPY requirements-build.txt requirements-build.txt
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
python3 -m pip install -r requirements-build.txt
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system -r requirements/build.txt \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK=0
|
ARG GIT_REPO_CHECK=0
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
# max jobs used by Ninja to build extensions
|
# max jobs used by Ninja to build extensions
|
||||||
ARG max_jobs=2
|
ARG max_jobs=2
|
||||||
@ -117,7 +98,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
|||||||
ARG SCCACHE_REGION_NAME=us-west-2
|
ARG SCCACHE_REGION_NAME=us-west-2
|
||||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||||
echo "Installing sccache..." \
|
echo "Installing sccache..." \
|
||||||
@ -137,19 +118,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||||
# Clean any existing CMake artifacts
|
|
||||||
rm -rf .deps && \
|
|
||||||
mkdir -p .deps && \
|
|
||||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
# sync the default value with .buildkite/check-wheel-size.py
|
# sync the default value with .buildkite/check-wheel-size.py
|
||||||
ARG VLLM_MAX_SIZE_MB=400
|
ARG VLLM_MAX_SIZE_MB=300
|
||||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||||
ARG RUN_WHEEL_CHECK=true
|
ARG RUN_WHEEL_CHECK=true
|
||||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||||
@ -162,35 +140,23 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
|||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
FROM base as dev
|
FROM base as dev
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
COPY requirements-lint.txt requirements-lint.txt
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
COPY requirements-test.txt requirements-test.txt
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
COPY requirements-dev.txt requirements-dev.txt
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install -r requirements-dev.txt
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
|
||||||
|
|
||||||
COPY requirements/lint.txt requirements/lint.txt
|
|
||||||
COPY requirements/test.txt requirements/test.txt
|
|
||||||
COPY requirements/dev.txt requirements/dev.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system -r requirements/dev.txt \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
|
||||||
ARG CUDA_VERSION=12.8.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
|
||||||
SHELL ["/bin/bash", "-c"]
|
|
||||||
|
|
||||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||||
|
|
||||||
@ -200,10 +166,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
&& for i in 1 2 3; do \
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
|
||||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
|
||||||
done \
|
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
@ -211,14 +174,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
# Install uv for faster pip installs
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
python3 -m pip install uv
|
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
|
|
||||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
@ -230,61 +185,39 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
|||||||
# we need to install torch and torchvision from the nightly builds first,
|
# we need to install torch and torchvision from the nightly builds first,
|
||||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||||
# after this step
|
# after this step
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install vllm wheel first, so that torch etc will be installed.
|
# Install vllm wheel first, so that torch etc will be installed.
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system dist/*.whl --verbose \
|
python3 -m pip install dist/*.whl --verbose
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
# If we need to build FlashInfer wheel before its release:
|
# How to build this FlashInfer wheel:
|
||||||
# $ export FLASHINFER_ENABLE_AOT=1
|
# $ export FLASHINFER_ENABLE_AOT=1
|
||||||
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
|
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
|
||||||
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
|
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
|
||||||
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
|
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
|
||||||
# $ cd flashinfer
|
# $ cd flashinfer
|
||||||
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
|
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
|
||||||
# $ rm -rf build
|
|
||||||
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
|
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
|
||||||
# $ ls dist
|
|
||||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
. /etc/environment && \
|
. /etc/environment && \
|
||||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||||
# FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
|
python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
|
||||||
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
|
|
||||||
uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
|
|
||||||
else \
|
|
||||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
|
|
||||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
|
||||||
if [ "$CUDA_MAJOR" -lt 12 ]; then \
|
|
||||||
export FLASHINFER_ENABLE_SM90=0; \
|
|
||||||
fi; \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
|
|
||||||
fi \
|
|
||||||
fi
|
fi
|
||||||
COPY examples examples
|
COPY examples examples
|
||||||
COPY benchmarks benchmarks
|
|
||||||
COPY ./vllm/collect_env.py .
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
# Although we build Flashinfer with AOT mode, there's still
|
||||||
. /etc/environment && \
|
|
||||||
uv pip list
|
|
||||||
|
|
||||||
# Even when we build Flashinfer with AOT mode, there's still
|
|
||||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||||
# install build dependencies for JIT compilation.
|
# install build dependencies for JIT compilation.
|
||||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||||
COPY requirements/build.txt requirements/build.txt
|
COPY requirements-build.txt requirements-build.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system -r requirements/build.txt \
|
python3 -m pip install -r requirements-build.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
@ -295,29 +228,17 @@ FROM vllm-base AS test
|
|||||||
|
|
||||||
ADD . /vllm-workspace/
|
ADD . /vllm-workspace/
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# install development dependencies (for testing)
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
python3 -m pip install -r requirements-dev.txt
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
|
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
python3 -m pip install -e tests/vllm_test_utils
|
||||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
|
||||||
uv pip install --system -r requirements/dev.txt; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
# enable fast downloads from hf (for testing)
|
# enable fast downloads from hf (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system hf_transfer
|
python3 -m pip install hf_transfer
|
||||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||||
|
|
||||||
# Copy in the v1 package for testing (it isn't distributed yet)
|
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||||
@ -328,26 +249,19 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
|
|||||||
# will not be imported by other tests
|
# will not be imported by other tests
|
||||||
RUN mkdir test_docs
|
RUN mkdir test_docs
|
||||||
RUN mv docs test_docs/
|
RUN mv docs test_docs/
|
||||||
RUN cp -r examples test_docs/
|
|
||||||
RUN mv vllm test_docs/
|
RUN mv vllm test_docs/
|
||||||
RUN mv mkdocs.yaml test_docs/
|
|
||||||
#################### TEST IMAGE ####################
|
#################### TEST IMAGE ####################
|
||||||
|
|
||||||
#################### OPENAI API SERVER ####################
|
#################### OPENAI API SERVER ####################
|
||||||
# base openai image with additional requirements, for any subsequent openai-style images
|
# base openai image with additional requirements, for any subsequent openai-style images
|
||||||
FROM vllm-base AS vllm-openai-base
|
FROM vllm-base AS vllm-openai-base
|
||||||
ARG TARGETPLATFORM
|
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
|
||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||||
else \
|
else \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
@ -26,18 +26,18 @@ WORKDIR /workspace
|
|||||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
|
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
|
||||||
pip install --upgrade pip && \
|
pip install --upgrade pip && \
|
||||||
pip install -r requirements/build.txt
|
pip install -r requirements-build.txt
|
||||||
|
|
||||||
FROM cpu-test-arm AS build
|
FROM cpu-test-arm AS build
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
|
||||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
|
||||||
pip install -v -r requirements/cpu.txt
|
pip install -v -r requirements-cpu.txt
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK=0
|
ARG GIT_REPO_CHECK=0
|
||||||
69
Dockerfile.cpu
Normal file
69
Dockerfile.cpu
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
||||||
|
|
||||||
|
FROM ubuntu:22.04 AS cpu-test-1
|
||||||
|
|
||||||
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
|
|
||||||
|
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/var/cache/apt \
|
||||||
|
apt-get update -y \
|
||||||
|
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
|
# intel-openmp provides additional performance improvement vs. openmp
|
||||||
|
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
pip install intel-openmp==2025.0.1
|
||||||
|
|
||||||
|
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
|
||||||
|
|
||||||
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
|
RUN pip install intel_extension_for_pytorch==2.5.0
|
||||||
|
|
||||||
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||||
|
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
|
||||||
|
pip install --upgrade pip && \
|
||||||
|
pip install -r requirements-build.txt
|
||||||
|
|
||||||
|
FROM cpu-test-1 AS build
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
|
||||||
|
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
|
||||||
|
pip install -v -r requirements-cpu.txt
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
ARG GIT_REPO_CHECK=0
|
||||||
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
|
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||||
|
ARG VLLM_CPU_DISABLE_AVX512
|
||||||
|
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
|
--mount=type=bind,source=.git,target=.git \
|
||||||
|
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||||
|
pip install dist/*.whl && \
|
||||||
|
rm -rf dist
|
||||||
|
|
||||||
|
WORKDIR /workspace/
|
||||||
|
|
||||||
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
# install development dependencies (for testing)
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user