Compare commits
148 Commits
woosuk/rm-
...
zhuohan/re
| Author | SHA1 | Date | |
|---|---|---|---|
| f048f16ba7 | |||
| 180880ddc3 | |||
| 3e0a770c15 | |||
| 83f478bb19 | |||
| 269c4db0a4 | |||
| 52efc34ebf | |||
| d95d0f4b98 | |||
| 0402428200 | |||
| 17af6aa0da | |||
| fc168c33f3 | |||
| acc78aeb88 | |||
| 0f67d4d962 | |||
| 7e1d697b56 | |||
| 699d62e6cf | |||
| cd390b609d | |||
| 2080b05099 | |||
| 6454afec90 | |||
| 41a62564a7 | |||
| 284cc92275 | |||
| 435be10db9 | |||
| b7030d962b | |||
| 3567816932 | |||
| e0ef8a2920 | |||
| 42efe609ba | |||
| 88d3141ec6 | |||
| 09a6a49eaf | |||
| 074475541a | |||
| d4c574c39f | |||
| c528b9006a | |||
| 85fee74b33 | |||
| 8dbe0c527f | |||
| 5cc6bddb6e | |||
| 1f9460c4c1 | |||
| 70022ffc00 | |||
| f417746ad7 | |||
| 0552cfb195 | |||
| 51dd14ac2b | |||
| dbfbf9f324 | |||
| ca76486a16 | |||
| a9f55dc588 | |||
| 81d5bb765a | |||
| 0825197bee | |||
| 9ef3d5b875 | |||
| 295c7f0267 | |||
| 3fa2c12185 | |||
| fe2016de2d | |||
| 237cf6d32a | |||
| faee3ccdc2 | |||
| 570c3e1cd4 | |||
| 3a4255c7c4 | |||
| 61089465a6 | |||
| 88afa11010 | |||
| d00ce29d89 | |||
| 3b7bdf983b | |||
| 50b788a17a | |||
| fc059c7061 | |||
| bfb240cc49 | |||
| e255d92990 | |||
| 3729ed00ba | |||
| 6644796bf4 | |||
| ff93cc8c84 | |||
| 243ed7d32e | |||
| 7e0941055f | |||
| 6738e4a093 | |||
| 2566dca2a9 | |||
| b4fda58a2d | |||
| a0003b56b0 | |||
| 5beacce2ea | |||
| 8669c69afa | |||
| 1651003c35 | |||
| 1cb8c6c5fe | |||
| e05a6754a8 | |||
| 084a9dae80 | |||
| c9461e05a4 | |||
| 4dfdb821c8 | |||
| 58fab50d82 | |||
| db6f28d898 | |||
| 14e2f1231e | |||
| 7c4767f1eb | |||
| 9771e0b432 | |||
| 980de31ca0 | |||
| 1c160841ea | |||
| 4ca13a8667 | |||
| 675aa2ec64 | |||
| 3ae082c373 | |||
| 49c00fe304 | |||
| 141d3b9fc5 | |||
| abf3db40ef | |||
| 8e4ca4d14e | |||
| 1a0f4defb7 | |||
| 843af7f7fc | |||
| 1f633b8632 | |||
| a4c29e6e82 | |||
| 8f18feb191 | |||
| ed540d6d4c | |||
| f6027b2855 | |||
| ab3e80042e | |||
| ceacedc1f9 | |||
| bfa59be8f1 | |||
| 265ecb05fb | |||
| 09a7e6f617 | |||
| 6c2eef5a5d | |||
| 19748806f0 | |||
| 4a8a567e16 | |||
| 344a0017c0 | |||
| becb7de40b | |||
| 250fb1b8ea | |||
| 647214f3d5 | |||
| ddeec11ba9 | |||
| 86ed77022d | |||
| aa1356ec53 | |||
| ecc3c0940a | |||
| ba09652de2 | |||
| bd66b8529b | |||
| 6c728f7771 | |||
| 80e9452984 | |||
| c3a2c6ac5f | |||
| 72f431e709 | |||
| be4445072c | |||
| f381cf2302 | |||
| 5ff5d94e77 | |||
| f95da13c3d | |||
| aef368aa08 | |||
| 5f6cbf60d6 | |||
| 3ada34f9cb | |||
| 0eb8f2b880 | |||
| 163965d183 | |||
| a03cf9bc70 | |||
| 352c0c8a28 | |||
| bfe0b4bd2a | |||
| 58fbbcb2f5 | |||
| 87778d5f00 | |||
| f9e7ad5400 | |||
| 4d0f266113 | |||
| e93ff6c8b9 | |||
| 1c691f4a71 | |||
| 9fce7bee74 | |||
| b63f2143f8 | |||
| f32bf7582e | |||
| 8a81d776ce | |||
| f6fdacd82c | |||
| d31f7844f8 | |||
| 7a6c8c3fa1 | |||
| 221bf72577 | |||
| b3aba04e5a | |||
| 8a297115e2 | |||
| 191eed0bb9 | |||
| fb860670da |
@ -7,6 +7,7 @@ from importlib import util
|
||||
|
||||
import pandas as pd
|
||||
|
||||
pd.options.display.float_format = "{:.2f}".format
|
||||
plotly_found = util.find_spec("plotly.express") is not None
|
||||
|
||||
|
||||
@ -109,7 +110,10 @@ def compare_data_columns(
|
||||
if len(compare_frames) >= 2:
|
||||
base = compare_frames[0]
|
||||
current = compare_frames[-1]
|
||||
ratio = current / base
|
||||
if "P99" in data_column or "Median" in data_column:
|
||||
ratio = base / current # for latency
|
||||
else:
|
||||
ratio = current / base
|
||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||
frames.append(ratio)
|
||||
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
|
||||
return saved_paths
|
||||
|
||||
|
||||
def _add_limit_line(fig, y_value, label):
|
||||
# Visible dashed line + annotation
|
||||
fig.add_hline(
|
||||
y=y_value,
|
||||
line_dash="dash",
|
||||
line_color="red" if "ttft" in label.lower() else "blue",
|
||||
annotation_text=f"{label}: {y_value} ms",
|
||||
annotation_position="top left",
|
||||
)
|
||||
# Optional: add a legend item (as a transparent helper trace)
|
||||
if plot and plotly_found:
|
||||
import plotly.graph_objects as go
|
||||
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=[None],
|
||||
y=[None],
|
||||
mode="lines",
|
||||
line=dict(
|
||||
dash="dash", color="red" if "ttft" in label.lower() else "blue"
|
||||
),
|
||||
name=f"{label}",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
for c in [
|
||||
"# of max concurrency.",
|
||||
"# of max concurrency",
|
||||
"Max Concurrency",
|
||||
"max_concurrency",
|
||||
"Concurrency",
|
||||
]:
|
||||
if c in df.columns:
|
||||
return c
|
||||
# Fallback: guess an integer-like column (harmless if unused)
|
||||
for c in df.columns:
|
||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
||||
return c
|
||||
return "# of max concurrency."
|
||||
|
||||
|
||||
def _highlight_threshold(
|
||||
df: pd.DataFrame, threshold: float
|
||||
) -> "pd.io.formats.style.Styler":
|
||||
"""Highlight numeric per-configuration columns with value <= threshold."""
|
||||
conc_col = _find_concurrency_col(df)
|
||||
key_cols = [
|
||||
c
|
||||
for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
|
||||
if c in df.columns
|
||||
]
|
||||
conf_cols = [
|
||||
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
|
||||
]
|
||||
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||
return df.style.map(
|
||||
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
||||
if pd.notna(v) and v <= threshold
|
||||
else "",
|
||||
subset=conf_cols,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@ -220,6 +289,26 @@ if __name__ == "__main__":
|
||||
default="# of max concurrency.",
|
||||
help="column name to use as X Axis in comparison graph",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--latency",
|
||||
type=str,
|
||||
default="p99",
|
||||
help="take median|p99 for latency like TTFT/TPOT",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ttft-max-ms",
|
||||
type=float,
|
||||
default=3000.0,
|
||||
help="Reference limit for TTFT plots (ms)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tpot-max-ms",
|
||||
type=float,
|
||||
default=100.0,
|
||||
help="Reference limit for TPOT plots (ms)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
drop_column = "P99"
|
||||
@ -234,12 +323,22 @@ if __name__ == "__main__":
|
||||
"# of max concurrency.",
|
||||
"qps",
|
||||
]
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"Median TTFT /n",
|
||||
"Median TPOT /n",
|
||||
]
|
||||
|
||||
if "median" in args.latency:
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"Median TTFT /n",
|
||||
"Median TPOT /n",
|
||||
]
|
||||
drop_column = "P99"
|
||||
elif "p99" in args.latency:
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"P99 TTFT /n",
|
||||
"P99 TPOT /n",
|
||||
]
|
||||
|
||||
if len(args.file) == 1:
|
||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||
@ -275,33 +374,83 @@ if __name__ == "__main__":
|
||||
f"Expected subset: {filtered_info_cols}, "
|
||||
f"but DataFrame has: {list(output_df.columns)}"
|
||||
)
|
||||
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||
output_df_sorted = output_df.sort_values(by=args.xaxis)
|
||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||
for name, group in output_groups:
|
||||
html = group.to_html()
|
||||
group_name = (
|
||||
",".join(map(str, name)).replace(",", "_").replace("/", "-")
|
||||
)
|
||||
group_html_name = "perf_comparison_" + group_name + ".html"
|
||||
|
||||
metric_name = str(data_cols_to_compare[i]).lower()
|
||||
if "tok/s" in metric_name:
|
||||
html = group.to_html()
|
||||
elif "ttft" in metric_name:
|
||||
styler = _highlight_threshold(group, args.ttft_max_ms).format(
|
||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||
na_rep="—",
|
||||
)
|
||||
html = styler.to_html(
|
||||
table_attributes='border="1" class="dataframe"'
|
||||
)
|
||||
elif (
|
||||
"tpot" in metric_name
|
||||
or "median" in metric_name
|
||||
or "p99" in metric_name
|
||||
):
|
||||
styler = _highlight_threshold(group, args.tpot_max_ms).format(
|
||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||
na_rep="—",
|
||||
)
|
||||
html = styler.to_html(
|
||||
table_attributes='border="1" class="dataframe"'
|
||||
)
|
||||
|
||||
text_file.write(html_msgs_for_data_cols[i])
|
||||
text_file.write(html)
|
||||
with open(group_html_name, "a+") as sub_text_file:
|
||||
sub_text_file.write(html_msgs_for_data_cols[i])
|
||||
sub_text_file.write(html)
|
||||
|
||||
if plot and plotly_found:
|
||||
import plotly.express as px
|
||||
if plot and plotly_found:
|
||||
import plotly.express as px
|
||||
|
||||
df = group[raw_data_cols]
|
||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||
# Melt DataFrame for plotting
|
||||
df_melted = df_sorted.melt(
|
||||
id_vars=info_cols[y_axis_index],
|
||||
var_name="Configuration",
|
||||
value_name=data_cols_to_compare[i],
|
||||
)
|
||||
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||
# Create Plotly line chart
|
||||
fig = px.line(
|
||||
df_melted,
|
||||
x=info_cols[y_axis_index],
|
||||
y=data_cols_to_compare[i],
|
||||
color="Configuration",
|
||||
title=title,
|
||||
markers=True,
|
||||
)
|
||||
# Export to HTML
|
||||
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
||||
df = group[raw_data_cols]
|
||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||
# Melt DataFrame for plotting
|
||||
df_melted = df_sorted.melt(
|
||||
id_vars=info_cols[y_axis_index],
|
||||
var_name="Configuration",
|
||||
value_name=data_cols_to_compare[i],
|
||||
)
|
||||
title = (
|
||||
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||
)
|
||||
# Create Plotly line chart
|
||||
fig = px.line(
|
||||
df_melted,
|
||||
x=info_cols[y_axis_index],
|
||||
y=data_cols_to_compare[i],
|
||||
color="Configuration",
|
||||
title=title,
|
||||
markers=True,
|
||||
)
|
||||
|
||||
# ---- Add threshold lines based on metric name ----
|
||||
if "ttft" in metric_name:
|
||||
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
||||
elif (
|
||||
"tpot" in metric_name
|
||||
or "median" in metric_name
|
||||
or "p99" in metric_name
|
||||
):
|
||||
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
||||
|
||||
# Export to HTML
|
||||
text_file.write(
|
||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||
)
|
||||
sub_text_file.write(
|
||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||
)
|
||||
|
||||
@ -63,9 +63,11 @@ serving_column_mapping = {
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||
"std_ttft_ms": "STD TTFT (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"median_tpot_ms": "Median",
|
||||
"p99_tpot_ms": "P99",
|
||||
"std_tpot_ms": "STD TPOT (ms)",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"p99_itl_ms": "P99 ITL (ms)",
|
||||
@ -368,7 +370,7 @@ if __name__ == "__main__":
|
||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||
# we want to turn it into "8xGPUTYPE"
|
||||
df["GPU"] = df["GPU"].apply(
|
||||
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
|
||||
lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
|
||||
)
|
||||
|
||||
# get markdown tables
|
||||
|
||||
@ -471,6 +471,11 @@ main() {
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
# dump vllm info via vllm collect-env
|
||||
env_output=$(vllm collect-env)
|
||||
|
||||
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
||||
|
||||
# benchmarking
|
||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||
|
||||
@ -1,28 +1,24 @@
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"test_name": "latency_llama8B_tp2",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama8B_tp4",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
|
||||
@ -95,6 +95,38 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -233,6 +265,41 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
@ -365,6 +432,38 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -503,6 +602,41 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
@ -638,6 +772,39 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -780,6 +947,42 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -28,13 +28,13 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -60,13 +60,13 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -76,39 +76,7 @@
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
@ -124,16 +92,16 @@
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 100
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -143,7 +111,7 @@
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 6,
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
@ -159,10 +127,150 @@
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 100
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@ -1,29 +1,24 @@
|
||||
[
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp1",
|
||||
"test_name": "throughput_llama8B_tp2",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp4",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
@ -15,6 +15,21 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-8
|
||||
@ -28,20 +43,6 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-6
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
@ -55,6 +56,20 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
@ -62,13 +77,12 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
||||
- label: "Build release image (arm64)"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
@ -142,6 +156,22 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
|
||||
@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
|
||||
if [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
elif [[ $normal_wheel == *"cu128"* ]]; then
|
||||
# if $normal_wheel matches cu128, do not upload the index.html
|
||||
echo "Skipping index files for cu128 wheels"
|
||||
else
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
fi
|
||||
|
||||
# generate index for nightly
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
|
||||
if [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
elif [[ $normal_wheel == *"cu128"* ]]; then
|
||||
# if $normal_wheel matches cu128, do not upload the index.html
|
||||
echo "Skipping index files for cu128 wheels"
|
||||
else
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
|
||||
@ -50,7 +50,7 @@ steps:
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
@ -395,7 +395,9 @@ steps:
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
#- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
|
||||
- label: Platform Tests (CUDA) # 4min
|
||||
timeout_in_minutes: 15
|
||||
@ -436,7 +438,11 @@ steps:
|
||||
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||
--ignore=lora/test_chatglm3_tp.py \
|
||||
--ignore=lora/test_llama_tp.py \
|
||||
--ignore=lora/test_llm_with_multi_loras.py
|
||||
--ignore=lora/test_llm_with_multi_loras.py \
|
||||
--ignore=lora/test_olmoe_tp.py \
|
||||
--ignore=lora/test_deepseekv2_tp.py \
|
||||
--ignore=lora/test_gptoss.py \
|
||||
--ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Compilation Unit Tests # 15min
|
||||
@ -454,8 +460,8 @@ steps:
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_functionalization.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
# - pytest -v -s compile/test_sequence_parallelism.py
|
||||
# - pytest -v -s compile/test_async_tp.py
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
@ -474,8 +480,8 @@ steps:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
|
||||
- label: PyTorch Fullgraph Test # 20min
|
||||
timeout_in_minutes: 30
|
||||
- label: PyTorch Fullgraph Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -485,6 +491,7 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
- pytest -v -s compile/test_fusions_e2e.py
|
||||
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
@ -494,6 +501,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
@ -606,7 +614,7 @@ steps:
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
@ -848,6 +856,18 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
@ -923,8 +943,8 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 38 min
|
||||
timeout_in_minutes: 60
|
||||
- label: Blackwell Test # 21 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
@ -937,8 +957,6 @@ steps:
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/fusion.py
|
||||
- vllm/compilation/fusion_attn.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
@ -955,13 +973,32 @@ steps:
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
# Fusion
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
@ -1081,6 +1118,7 @@ steps:
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
@ -1128,6 +1166,11 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
@ -1171,7 +1214,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
timeout_in_minutes: 45
|
||||
@ -1201,6 +1244,18 @@ steps:
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
@ -1232,12 +1287,16 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distrubted Tests (H200) # optional
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
|
||||
|
||||
@ -172,6 +172,8 @@ steps:
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
@ -349,7 +351,8 @@ steps:
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
- label: Platform Tests (CUDA) # 4min
|
||||
timeout_in_minutes: 15
|
||||
@ -384,7 +387,12 @@ steps:
|
||||
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||
--ignore=lora/test_chatglm3_tp.py \
|
||||
--ignore=lora/test_llama_tp.py \
|
||||
--ignore=lora/test_llm_with_multi_loras.py
|
||||
--ignore=lora/test_llm_with_multi_loras.py \
|
||||
--ignore=lora/test_olmoe_tp.py \
|
||||
--ignore=lora/test_deepseekv2_tp.py \
|
||||
--ignore=lora/test_gptoss.py \
|
||||
--ignore=lora/test_qwen3moe_tp.py
|
||||
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Compilation Unit Tests # 15min
|
||||
@ -529,7 +537,7 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -970,6 +978,8 @@ steps:
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
@ -977,6 +987,7 @@ steps:
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
@ -1064,6 +1075,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
9
.github/CODEOWNERS
vendored
9
.github/CODEOWNERS
vendored
@ -5,8 +5,8 @@
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
@ -25,7 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/mla @pavanimajety
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
@ -44,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -94,6 +94,9 @@ ipython_config.py
|
||||
# generated files
|
||||
**/generated/**
|
||||
|
||||
# uv
|
||||
uv.lock
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
|
||||
@@ -38,7 +38,7 @@ repos:
rev: 0.9.1
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -48,8 +48,8 @@ repos:
entry: python tools/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local"
name: Run mypy locally for lowest supported Python version
entry: python tools/pre_commit/mypy.py 0 "3.10"
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")

#
# Try to find python package with an executable that exactly matches
@@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")

if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -188,16 +188,47 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()

#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
# Flag to enable ACL kernels for AARCH64 platforms
if (VLLM_BUILD_ACL STREQUAL "ON")
set(USE_ACL ON)
else()
set(USE_ACL OFF)
endif()

# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
if(ASIMD_FOUND)
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.2.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
endif()

# Build ACL with scons
include(ProcessorCount)
ProcessorCount(_NPROC)
execute_process(
COMMAND scons -j${_NPROC}
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
multi_isa=1 openmp=1 cppthreads=0
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
endif()

set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()

set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")

if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@@ -217,16 +248,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
)
endif()

if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
if(NOT ARM_COMPUTE_LIBRARY)
message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()

set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
)

set(FlashMLA_INCLUDES
@@ -187,7 +187,8 @@ template <>
struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
size_t operator()(
const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
hash<int>()(static_cast<int>(val.b_type));
}
};

@@ -216,7 +217,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,

bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
l.b_type == r.b_type;
}

bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@@ -493,8 +495,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
const MSizeCacheKey& key) {
if (m_size_cache_.get() == nullptr) {
ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
ClassMatmulCacheKey class_key = {
.b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
m_size_cache_ =
get_matul_class_primitive_cache(class_key, primitive_cache_size_);
}
return m_size_cache_->get_or_create(key, [&]() {
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
@@ -199,6 +199,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
struct ClassMatmulCacheKey {
dnnl_dim_t b_n_size;
dnnl_dim_t b_k_size;
dnnl::memory::data_type b_type;

friend bool operator==(const ClassMatmulCacheKey& l,
const ClassMatmulCacheKey& r);
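For intuition, the class-level primitive cache is now keyed on the weight data type as well as its N/K shape, so same-shape weights of different dtypes no longer collide on one cached oneDNN primitive. A rough Python analogy (hypothetical names, illustration only, not part of the diff):

class_primitive_cache = {}

def get_class_primitive(b_n_size, b_k_size, b_type):
    # b_type is now part of the key; previously (b_n_size, b_k_size) alone was used,
    # so weights of different dtypes with identical shapes shared one entry.
    key = (b_n_size, b_k_size, b_type)
    if key not in class_primitive_cache:
        class_primitive_cache[key] = make_primitive(b_n_size, b_k_size, b_type)  # hypothetical factory
    return class_primitive_cache[key]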
csrc/moe/moe_lora_align_sum_kernels.cu (169 changes, new file)
@@ -0,0 +1,169 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include <ATen/ATen.h>
#include <ATen/cuda/Atomic.cuh>

#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"

namespace {

__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
return row * total_col + col;
}

}  // namespace

// TODO: Refactor common parts with moe_align_sum_kernels
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_lora_align_sum_kernel(
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
int64_t block_size, int num_experts, int max_loras, size_t numel,
int max_num_tokens_padded, int max_num_m_blocks,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int topk_num, int32_t* total_tokens_post_pad) {
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;

int lora_id = blockIdx.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);

// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
}

// Initialize expert_ids with -1
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
expert_ids[lora_id * max_num_m_blocks + it] = -1;
}

// Initialize total_tokens_post_pad with 0
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
}

for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}

for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int mask = token_lora_mapping[i / topk_num] == lora_id;
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
tokens_cnts[idx] += mask;
}

__syncthreads();

// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}

__syncthreads();

// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
}

__syncthreads();

/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
threadIdx.x;
}
}

for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];

int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
atomicAdd(
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
(i - numel) * mask);
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
}
}

void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad) {
const int topk_num = topk_ids.size(1);

TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
TORCH_CHECK(num_thread <= 1024,
"num_thread must be less than 1024, "
"and fallback is not implemented yet.");
const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
(num_experts + 1) * sizeof(int32_t);

if (shared_mem > device_max_shared_mem) {
TORCH_CHECK(false,
"Shared memory usage exceeds device limit, and global memory "
"fallback is not implemented yet.");
}

VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
dim3 blockDim(num_thread);
auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
max_loras, topk_ids.numel(), max_num_tokens_padded,
max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>());
});
}
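For intuition, a rough host-side reference of the per-LoRA alignment this kernel computes (NumPy sketch, illustration only; it omits the sorted_token_ids scatter and is not part of the diff):

import numpy as np

def moe_lora_align_ref(topk_ids, token_lora_mapping, num_experts, block_size, lora_id):
    # topk_ids: (num_tokens, topk) expert assignments; token_lora_mapping: (num_tokens,)
    topk = topk_ids.shape[1]
    flat = topk_ids.reshape(-1)
    # Keep only expert assignments whose owning token belongs to this LoRA adapter.
    owner = token_lora_mapping[np.arange(flat.size) // topk]
    counts = np.bincount(flat[owner == lora_id], minlength=num_experts)
    # Pad each expert's token count up to a multiple of block_size.
    padded = ((counts + block_size - 1) // block_size) * block_size
    cumsum = np.concatenate(([0], np.cumsum(padded)))
    # One expert id per padded block, matching expert_ids in the kernel.
    expert_ids = np.repeat(np.arange(num_experts), padded // block_size)
    return cumsum, expert_ids, int(cumsum[-1])  # cumsum[-1] == total_tokens_post_pad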
@@ -20,6 +20,14 @@ void batched_moe_align_block_size(int64_t max_tokens_per_batch,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);

void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -33,6 +33,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);

// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
m.def(
"moe_lora_align_block_size(Tensor topk_ids,"
" Tensor token_lora_mapping,"
" int num_experts,"
" int block_size, int max_loras, "
" int max_num_tokens_padded, "
" int max_num_m_blocks, "
" Tensor !sorted_token_ids,"
" Tensor !experts_ids,"
" Tensor !num_tokens_post_pad) -> () ");
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
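A hypothetical Python call site for the newly registered op; the `_moe_C` namespace and the padded-size formulas below are assumptions for illustration, not taken from this diff:

import torch

num_tokens, topk, num_experts, block_size, max_loras = 8, 2, 4, 16, 2
max_num_tokens_padded = num_tokens * topk + num_experts * (block_size - 1)  # assumed sizing
max_num_m_blocks = (max_num_tokens_padded + block_size - 1) // block_size
topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda")
token_lora_mapping = torch.randint(0, max_loras, (num_tokens,), dtype=torch.int32, device="cuda")
sorted_token_ids = torch.empty(max_loras, max_num_tokens_padded, dtype=torch.int32, device="cuda")
expert_ids = torch.empty(max_loras, max_num_m_blocks, dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty(max_loras, dtype=torch.int32, device="cuda")
# Output buffers are filled in place by the op.
torch.ops._moe_C.moe_lora_align_block_size(
    topk_ids, token_lora_mapping, num_experts, block_size, max_loras,
    max_num_tokens_padded, max_num_m_blocks,
    sorted_token_ids, expert_ids, num_tokens_post_pad)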
@@ -99,8 +99,11 @@ void apply_repetition_penalties_(torch::Tensor& logits,

void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
int64_t numRows, int64_t stride0, int64_t stride1);

void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seq_lens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1);

void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
@@ -304,7 +307,7 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit);
bool use_exllama, bool use_v2_format, int64_t bit);

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

@@ -185,7 +185,7 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*,
const uint32_t*, const half*,
half*, const int, const int,
const int, const int,
const int*);
const bool, const int*);

template <bool first_block, int m_count>
__global__ void gemm_half_q_half_gptq_4bit_kernel(
@@ -193,12 +193,15 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto t = threadIdx.x;

// Block
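The zero_offset variable carries the whole GPTQv1 vs GPTQv2 difference that the rest of this file threads through its kernels: v1 checkpoints store zero points off by one, v2 stores them directly. A minimal sketch of the dequantization convention (illustration only, not the kernel code):

def dequant(q, scale, zero, use_v2_format):
    # GPTQv1 checkpoints store zero points shifted down by one, so the kernels add 1 back;
    # GPTQv2 stores the zero point as-is, so the offset is 0.
    zero_offset = 0 if use_v2_format else 1
    return (q - (zero + zero_offset)) * scale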
@@ -256,10 +259,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
half2 y1y16[4][2];
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_f(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);

// Column result
float block_c[m_count][4] = {};
@@ -272,10 +275,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
nextgroup += groupsize;
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_f(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
}

#pragma unroll
@@ -329,12 +332,15 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto t = threadIdx.x;

// Block
@@ -409,10 +415,10 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int4 load_int4 = *b_ptr4;

half2 dq[4][8];
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);

#pragma unroll
for (int m = 0; m < m_count; m++) {
@@ -448,12 +454,15 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto t = threadIdx.x;

// Block
@@ -534,13 +543,13 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(

half2 dq[4][16];
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
size_n, zeros[0] + 1);
size_n, zeros[0] + zero_offset);
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
size_n, zeros[1] + 1);
size_n, zeros[1] + zero_offset);
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
size_n, zeros[2] + 1);
size_n, zeros[2] + zero_offset);
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
size_n, zeros[3] + 1);
size_n, zeros[3] + zero_offset);

#pragma unroll
for (int m = 0; m < m_count; m++) {
@@ -574,12 +583,15 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto t = threadIdx.x;

// Block
@@ -658,13 +670,13 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(

half2 dq[4][4];
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
zeros[0] + 1);
zeros[0] + zero_offset);
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
zeros[1] + 1);
zeros[1] + zero_offset);
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
zeros[2] + 1);
zeros[2] + zero_offset);
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
zeros[3] + 1);
zeros[3] + zero_offset);

for (int m = 0; m < m_count; m++) {
block_c[m][0] =
@@ -730,7 +742,8 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_q_perm,
half* c, int size_m, int size_n, int size_k,
int m_count, int groups, int bit) {
int m_count, int groups, bool use_v2_format,
int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@@ -743,20 +756,23 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);

const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_gptq_qzeros,
b_gptq_scales, c, size_m, size_n,
size_k, groups, b_q_perm);
kernel<<<gridDim, blockDim, 0, stream>>>(
a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k,
groups, use_v2_format, b_q_perm);
}

__global__ void reconstruct_exllama_8bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

@@ -812,13 +828,13 @@ __global__ void reconstruct_exllama_8bit_kernel(

half2 dq[4][4];
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
zeros[0] + 1);
zeros[0] + zero_offset);
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
zeros[1] + 1);
zeros[1] + zero_offset);
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
zeros[2] + 1);
zeros[2] + zero_offset);
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
zeros[3] + 1);
zeros[3] + zero_offset);

// half* dqh = (half*)dq;
if (b_q_perm) {
@@ -849,11 +865,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

@@ -888,10 +907,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
half2 y1y16[4][2];
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_h2(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);

__syncthreads();

@@ -904,10 +923,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
nextgroup += groupsize;
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_h2(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
}

for (int p = 0; p < 4; p++) {
@@ -954,11 +973,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

@@ -1016,13 +1038,13 @@ __global__ void reconstruct_exllama_3bit_kernel(

half2 dq[4][16];
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
size_n, zeros[0] + 1);
size_n, zeros[0] + zero_offset);
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
size_n, zeros[1] + 1);
size_n, zeros[1] + zero_offset);
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
size_n, zeros[2] + 1);
size_n, zeros[2] + zero_offset);
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
size_n, zeros[3] + 1);
size_n, zeros[3] + zero_offset);

if (b_q_perm) {
for (int j = 0; j < 16; j++) {
@@ -1052,11 +1074,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

@@ -1108,10 +1133,10 @@ __global__ void reconstruct_exllama_2bit_kernel(
int4 load_int4 = *b_ptr4;

half2 dq[4][8];
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);

b_ptr += size_n;
// half* dqh = (half*)dq;
@@ -1143,7 +1168,7 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_q_perm,
half* out, int height, int width, int groups,
int bit) {
bool use_v2_format, int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@@ -1162,14 +1187,14 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups,
out);
use_v2_format, out);
}

__global__ void gemm_half_q_half_alt_4bit_kernel(
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
half* __restrict__ mul, const half* __restrict__ scales,
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
int batch, int height, int width) {
int batch, int height, int width, bool use_v2_format) {
int zero_width = width / 8;
int vec_height = height * 4;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
@@ -1179,6 +1204,9 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
for (int m = 0; m < b_end; ++m) {
@@ -1223,10 +1251,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
half2 zero = __halves2half2(
__hmul(scale_f,
__int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) -
1)),
__hmul(scale_f2,
__int2half_rn(
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
zero_offset)),
__hmul(
scale_f2,
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) -
zero_offset)));
scales_tmp[tmp_k] = scale;
zeros_tmp[tmp_k] = zero;
}
@@ -1268,7 +1297,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
half* __restrict__ mul, const half* __restrict__ scales,
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
int batch, int height, int width) {
int batch, int height, int width, bool use_v2_format) {
int zero_width = width / 4;
int vec_height = height * 2;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
@@ -1278,6 +1307,9 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
for (int m = 0; m < b_end; ++m) {
@@ -1312,12 +1344,13 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
half scale_f2 = scales[g2 * width + w];
half2 scale = __halves2half2(scale_f, scale_f2);
half2 zero = __halves2half2(
__hmul(scale_f,
__int2half_rn(
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
__hmul(scale_f2,
__int2half_rn(
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
__hmul(scale_f, __int2half_rn(
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) -
zero_offset)),
__hmul(
scale_f2,
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) -
zero_offset)));
scales_tmp[tmp_k] = scale;
zeros_tmp[tmp_k] = zero;
}
@@ -1355,7 +1388,7 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx,
half* c, int size_m, int size_n, int size_k,
int bit) {
bool use_v2_format, int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@@ -1372,17 +1405,15 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(
(const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx,
size_m, size_k / 32 * bit, size_n);
size_m, size_k / 32 * bit, size_n, use_v2_format);
}

template <class T, int bit>
__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int* __restrict__ g_idx,
const int height, const int width,
const int group,
half* __restrict__ out) {
__global__ void reconstruct_gptq_kernel(
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
const int height, const int width, const int group,
const bool use_v2_format, half* __restrict__ out) {
// Start of block

auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
@@ -1395,6 +1426,9 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
MatrixView_half w_scales_(w_scales, group, width);
T w_zeros_(w_zeros, group, width);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

uint32_t w_read = w[blockIdx.y * width + column];
half* out_ptr = out_.item_ptr(row, column);

@@ -1402,7 +1436,7 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
for (int s = 0; s < 32; s += bit) {
int group = g_idx[row + s / bit];
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
half w_item =
__hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero),
w_scale);
@@ -1415,7 +1449,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
const int height, const int width, const int group,
half* __restrict__ out) {
const bool use_v2_format, half* __restrict__ out) {
// Start of block
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto row = blockIdx.y * 32;
@@ -1427,6 +1461,9 @@ __global__ void reconstruct_gptq_3bit_kernel(
MatrixView_half w_scales_(w_scales, group, width);
MatrixView_q3_row w_zeros_(w_zeros, group, width);

// GPTQv2 and GPTQv1 handles zero points differently
int zero_offset = use_v2_format ? 0 : 1;

uint32_t w1 = w[(blockIdx.y * 3) * width + column];
uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
@@ -1436,7 +1473,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
for (int i = 0; i < 32; i += 1) {
int group = g_idx[row + i];
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
int w_item;
if (i == 10) {
w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
@@ -1456,7 +1493,8 @@ __global__ void reconstruct_gptq_3bit_kernel(

void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx, half* out,
int height, int width, int groups, int bit) {
int height, int width, int groups, bool use_v2_format,
int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@@ -1476,7 +1514,7 @@ void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(b_q_weight, b_gptq_scales,
b_gptq_qzeros, b_g_idx, height,
width, groups, out);
width, groups, use_v2_format, out);
}

void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
@@ -1484,7 +1522,8 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx,
half* c, half* temp_dq, int size_m, int size_n,
int size_k, int groups, bool use_exllama, int bit) {
int size_k, int groups, bool use_exllama,
bool use_v2_format, int bit) {
bool use_reconstruct;
if (use_exllama) {
use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) ||
@@ -1498,10 +1537,10 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
// Reconstruct FP16 matrix, then cuBLAS
if (use_exllama) {
reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
temp_dq, size_k, size_n, groups, bit);
temp_dq, size_k, size_n, groups, use_v2_format, bit);
} else {
reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
temp_dq, size_k, size_n, groups, bit);
temp_dq, size_k, size_n, groups, use_v2_format, bit);
}

const half alpha = __float2half(1.0f);
@@ -1517,18 +1556,18 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
if (max_chunks) {
gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
b_g_idx, c, last_chunk, size_n, size_k,
BLOCK_M_SIZE_MAX, groups, bit);
BLOCK_M_SIZE_MAX, groups, use_v2_format, bit);
}

if (last_chunk_size) {
gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight,
b_gptq_qzeros, b_gptq_scales, b_g_idx,
c + last_chunk * size_n, last_chunk_size,
size_n, size_k, last_chunk_size, groups, bit);
gemm_half_q_half_cuda_part(
a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales,
b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k,
last_chunk_size, groups, use_v2_format, bit);
}
} else {
gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
c, size_m, size_n, size_k, bit);
c, size_m, size_n, size_k, use_v2_format, bit);
}
}

@@ -1815,7 +1854,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit) {
bool use_exllama, bool use_v2_format, int64_t bit) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@@ -1833,7 +1872,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
c.size(1),  // n
a.size(1),  // k
b_gptq_qzeros.size(0),  // group number
use_exllama, bit);
use_exllama, use_v2_format, bit);
return c;
}

csrc/sampler.cu (107 changes)
@@ -54,15 +54,10 @@ static inline __device__ uint16_t extractBinIdx(float x) {
return 511 - (tmp.u16 >> 7);
}

template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
float* outLogits, int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;

// The top-k width.
static constexpr int kTopK = 2048;
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
__device__ void topKPerRowJob(const float* logits, const int rowStart,
const int rowEnd, const int rowIdx,
int* outIndices, int stride0, int stride1) {
// The number of elements per thread for the final top-k sort.
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
// The class to sort the elements during the final top-k sort.
@@ -103,17 +98,11 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
__shared__ int smemHistogram[kNumBins];
// Shared memory to store the selected indices.
__shared__ int smemIndices[kTopK];
// Shared memory to store the selected logits.
__shared__ float smemLogits[kTopK];
// Shared memory to store the threshold bin.
__shared__ int smemThresholdBinIdx[1];
// Shared memory counter to register the candidates for the final phase.
__shared__ int smemFinalDstIdx[1];

// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
// The length of the row.
int rowLen = rowEnd - rowStart;

@@ -124,13 +113,10 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
rowIt += kNumThreadsPerBlock) {
int idx = rowStart + rowIt;
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
outLogits[rowIdx * kTopK + rowIt] =
logits[rowIdx * stride0 + idx * stride1];
}
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
rowIt += kNumThreadsPerBlock) {
outIndices[rowIdx * kTopK + rowIt] = -1;
outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
}
return;
}
@@ -201,7 +187,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
uint16_t idx = extractBinIdx(logit);
if (idx < thresholdBinIdx) {
int dstIdx = atomicAdd(&smemHistogram[idx], 1);
smemLogits[dstIdx] = logit;
smemIndices[dstIdx] = rowIt;
} else if (idx == thresholdBinIdx) {
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
@@ -250,7 +235,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
int dstIdx = baseIdx + srcIdx;
if (dstIdx < kTopK) {
smemLogits[dstIdx] = finalLogits[ii];
smemIndices[dstIdx] = finalIndices[ii];
}
}
@@ -258,31 +242,58 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
// Make sure the data is in shared memory.
__syncthreads();

// The topK logits.
float topKLogits[kNumTopKItemsPerThread];
// The topK indices.
int topKIndices[kNumTopKItemsPerThread];

// Load from shared memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
}

// Sort the elements.
TopKSort(smemFinal.topKSort)
.SortDescendingBlockedToStriped(topKLogits, topKIndices);

// Store to global memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
outIndices[offset] = topKIndices[ii] - rowStart;
outLogits[offset] = topKLogits[ii];
outIndices[offset] =
smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
}
}

template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;

// The top-k width.
static constexpr int kTopK = 2048;

// The row computed by this block.
int rowIdx = blockIdx.x;

// The range of logits within the row.
int rowStart = rowStarts[rowIdx];
int rowEnd = rowEnds[rowIdx];

topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}

template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
int* outIndices, int stride0,
int stride1, int next_n) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;

// The top-k width.
static constexpr int kTopK = 2048;

// The row computed by this block.
int rowIdx = blockIdx.x;

// The range of logits within the row.
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;

topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}

} // namespace vllm
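The decode variant differs from the prefill path only in how each block derives its row bounds: instead of explicit rowStarts/rowEnds tensors it reconstructs them from seq_lens and next_n. The same index math, mirrored in Python for readability (illustration only):

def decode_row_bounds(row_idx, seq_lens, next_n):
    # Each request contributes next_n rows; row_idx % next_n selects the draft position.
    seq_len = seq_lens[row_idx // next_n]
    row_start = 0
    row_end = seq_len - next_n + (row_idx % next_n) + 1
    return row_start, row_end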

void apply_repetition_penalties_(
@@ -326,10 +337,23 @@ void apply_repetition_penalties_(
});
}

void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seqLens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

vllm::topKPerRowDecode<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(next_n));
}

void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1) {
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -338,6 +362,5 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
values.data_ptr<float>(), static_cast<int>(stride0),
static_cast<int>(stride1));
static_cast<int>(stride0), static_cast<int>(stride1));
}

@@ -185,10 +185,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Optimized top-k per row operation
ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, Tensor! values, int numRows, int stride0, "
"Tensor! indices, int numRows, int stride0, "
"int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);

ops.def(
"top_k_per_row_decode(Tensor logits, int next_n, "
"Tensor seq_lens, Tensor! indices, int numRows, "
"int stride0, int stride1) -> ()");
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);

// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
@@ -551,7 +557,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// to prevent the meta function registry.
ops.def(
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
"use_v2_format, int bit) "
"-> Tensor",
{stride_tag});
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);

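Because the registered schema gains use_v2_format ahead of bit, Python call sites must pass the extra boolean. A sketch of the updated call; the `_C` namespace and the checkpoint-format check are assumptions for illustration, not taken from this diff:

import torch

# Tensors (a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx) come from the loaded GPTQ layer.
use_v2_format = checkpoint_format == "gptq_v2"  # hypothetical detection of the checkpoint flavor
out = torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
                             b_g_idx, use_exllama, use_v2_format, bit)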
@@ -5,7 +5,7 @@
# docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.9.1
ARG PYTHON_VERSION=3.12

# By parameterizing the base images, we allow third-party to use their own
@@ -132,7 +132,9 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# cuda arch list used by torch
@@ -273,6 +275,7 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@@ -353,9 +356,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# TODO (huydhn): Remove this once xformers is released for 2.9.0
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
BASH

# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
@@ -422,6 +434,7 @@ ARG PYTHON_VERSION

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -434,7 +447,8 @@ ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi

# install development dependencies (for testing)
@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
|
||||
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
|
||||
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
@ -79,6 +79,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
######################### BUILD IMAGE #########################
|
||||
FROM base AS vllm-build
|
||||
|
||||
ARG max_jobs=2
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512=0
|
||||
@ -104,16 +107,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
######################### TEST DEPS #########################
|
||||
FROM base AS vllm-test-deps
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
|
||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
cp requirements/test.in requirements/cpu-test.in && \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||
ARG TRITON_BRANCH="f9e5bf54"
|
||||
ARG TRITON_BRANCH="57c693b6"
|
||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||
ARG PYTORCH_BRANCH="b2fb6885"
|
||||
ARG PYTORCH_BRANCH="1c57644d"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="0e60e394"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="2ab9f4cd"
|
||||
ARG AITER_BRANCH="eef23c7f"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
Binary file not shown.
|
| Before: 119 KiB | After: 119 KiB |
@ -6,7 +6,8 @@ toc_depth: 4
|
||||
|
||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||
|
||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||
|
||||
@ -29,7 +30,7 @@ th {
|
||||
| Dataset | Online | Offline | Data Path |
|
||||
|---------|--------|---------|-----------|
|
||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||
@ -714,7 +715,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
|
||||
|
||||
Notes:
|
||||
|
||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||
- Video sampling is not yet implemented.
|
||||
|
||||
Start the server (example):
|
||||
@ -924,6 +925,163 @@ throughput numbers correctly is also adjusted.
|
||||
|
||||
</details>
|
||||
|
||||
## Parameter Sweeps
|
||||
|
||||
### Online Benchmark
|
||||
|
||||
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
|
||||
|
||||
Follow these steps to run the script:
|
||||
|
||||
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
|
||||
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
|
||||
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
|
||||
|
||||
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"max_num_seqs": 32,
|
||||
"max_num_batched_tokens": 1024
|
||||
},
|
||||
{
|
||||
"max_num_seqs": 64,
|
||||
"max_num_batched_tokens": 1024
|
||||
},
|
||||
{
|
||||
"max_num_seqs": 64,
|
||||
"max_num_batched_tokens": 2048
|
||||
},
|
||||
{
|
||||
"max_num_seqs": 128,
|
||||
"max_num_batched_tokens": 2048
|
||||
},
|
||||
{
|
||||
"max_num_seqs": 128,
|
||||
"max_num_batched_tokens": 4096
|
||||
},
|
||||
{
|
||||
"max_num_seqs": 256,
|
||||
"max_num_batched_tokens": 4096
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
|
||||
|
||||
- Example: Using different input/output lengths for random dataset:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"random_input_len": 128,
|
||||
"random_output_len": 32
|
||||
},
|
||||
{
|
||||
"random_input_len": 256,
|
||||
"random_output_len": 64
|
||||
},
|
||||
{
|
||||
"random_input_len": 512,
|
||||
"random_output_len": 128
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
5. Determine where you want to save the results, and pass that to `--output-dir`.
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
python -m vllm.benchmarks.sweep.serve \
|
||||
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||
--serve-params benchmarks/serve_hparams.json \
|
||||
--bench-params benchmarks/bench_hparams.json \
|
||||
-o benchmarks/results
|
||||
```
|
||||
|
||||
!!! important
|
||||
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
|
||||
You can use `--dry-run` to preview the commands to be run.
|
||||
|
||||
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
|
||||
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
|
||||
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
|
||||
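For illustration, the Cartesian product mentioned above simply means that every entry of `--serve-params` is paired with every entry of `--bench-params`; here is a toy sketch of that expansion (the file names reuse the examples on this page):

```python
import itertools
import json

# Example files from this page; any JSON lists of parameter dicts work.
with open("benchmarks/serve_hparams.json") as f:
    serve_params = json.load(f)
with open("benchmarks/bench_hparams.json") as f:
    bench_params = json.load(f)

# Every (serve, bench) pairing gets its own benchmark run.
for serve_cfg, bench_cfg in itertools.product(serve_params, bench_params):
    print("serve:", serve_cfg, "bench:", bench_cfg)
```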
|
||||
!!! note
|
||||
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
|
||||
|
||||
!!! tip
|
||||
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
|
||||
|
||||
### SLA Auto-Tuner
|
||||
|
||||
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
|
||||
|
||||
For example, to ensure E2E latency within different target values for 99% of requests:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"p99_e2el_ms": "<=200"
|
||||
},
|
||||
{
|
||||
"p99_e2el_ms": "<=500"
|
||||
},
|
||||
{
|
||||
"p99_e2el_ms": "<=1000"
|
||||
},
|
||||
{
|
||||
"p99_e2el_ms": "<=2000"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
python -m vllm.benchmarks.sweep.serve_sla \
|
||||
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||
--serve-params benchmarks/serve_hparams.json \
|
||||
--bench-params benchmarks/bench_hparams.json \
|
||||
--sla-params benchmarks/sla_hparams.json \
|
||||
--sla-variable max_concurrency \
|
||||
-o benchmarks/results
|
||||
```
|
||||
|
||||
The algorithm for adjusting the SLA variable is as follows:
|
||||
|
||||
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
|
||||
- For example, the initial request rate is set to the concurrency under infinite QPS.
|
||||
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
|
||||
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
|
||||
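A rough sketch of that search, with `run_benchmark` and `meets_sla` standing in for what `serve_sla.py` actually measures and checks (both names are illustrative assumptions, not the real implementation):

```python
def tune_sla_variable(run_benchmark, meets_sla, initial_value: int) -> int:
    """Return the largest value of the SLA variable that still meets the SLA.

    run_benchmark(value) runs one benchmark and returns its metrics;
    meets_sla(metrics) checks them against the SLA constraints.
    """
    value = initial_value
    if not meets_sla(run_benchmark(value)):
        return 0  # SLA cannot be met even at the starting point (sketch only)

    # Step 2: keep doubling until the SLA is no longer satisfied.
    while meets_sla(run_benchmark(value * 2)):
        value *= 2
    lo, hi = value, value * 2  # SLA holds at lo, fails at hi

    # Step 3: binary search inside the window for the maximum passing value.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if meets_sla(run_benchmark(mid)):
            lo = mid
        else:
            hi = mid
    return lo
```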
|
||||
!!! important
|
||||
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
|
||||
|
||||
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
|
||||
|
||||
### Visualizer
|
||||
|
||||
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
|
||||
--var-x max_concurrency \
|
||||
--row-by random_input_len \
|
||||
--col-by random_output_len \
|
||||
--curve-by api_server_count,max_num_batched_tokens \
|
||||
--filter-by 'max_concurrency<=1024'
|
||||
```
|
||||
|
||||
!!! tip
|
||||
You can use `--dry-run` to preview the figures to be plotted.
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
|
||||
|
||||
@ -87,7 +87,7 @@ is ineffective.
|
||||
|
||||
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
|
||||
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
|
||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||
|
||||
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
||||
@ -100,35 +100,17 @@ to warm it up so that future builds are faster.
|
||||
|
||||
## Update dependencies
|
||||
|
||||
Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
|
||||
Several vLLM dependencies like xFormers depend on PyTorch and need
|
||||
to be updated accordingly. Rather than waiting for all of them to publish new
|
||||
releases (which would take too much time), they can be built from
|
||||
source to unblock the update process.
|
||||
|
||||
### FlashInfer
|
||||
|
||||
Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
|
||||
|
||||
```bash
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
|
||||
export FLASHINFER_ENABLE_SM90=1
|
||||
uv pip install --system \
|
||||
--no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
|
||||
```
|
||||
|
||||
One caveat is that building FlashInfer from source adds approximately 30
|
||||
minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
|
||||
public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
|
||||
team if you want to get the package published there.
|
||||
|
||||
### xFormers
|
||||
|
||||
Similar to FlashInfer, here is how to build and install xFormers from source:
|
||||
|
||||
```bash
|
||||
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
|
||||
MAX_JOBS=16 uv pip install --system \
|
||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
|
||||
```
|
||||
|
||||
## Update all the different vLLM platforms
|
||||
|
||||
@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
|
||||
"""NO CUDA Graphs support"""
|
||||
```
|
||||
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code for [this][vllm.v1.worker.gpu_model_runner.GPUModelRunner._check_and_update_cudagraph_mode].
|
||||
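A condensed sketch of that downgrade policy (the enum member names and ordering here are assumptions for illustration; the authoritative logic lives in `_check_and_update_cudagraph_mode`):

```python
import enum

class AttentionCGSupport(enum.IntEnum):
    # Ordered so that min() over all backends yields the most restrictive one.
    # Only the members mentioned above are spelled out; others are omitted.
    NEVER = 0
    UNIFORM_BATCH = 1
    ALWAYS = 2  # assumed name for unrestricted support

def resolve_cudagraph_mode(requested: str, backend_caps: list[AttentionCGSupport]) -> str:
    """Downgrade a requested CUDA Graphs mode to fit the weakest backend (sketch)."""
    min_cap = min(backend_caps)
    if requested == "FULL" and min_cap == AttentionCGSupport.UNIFORM_BATCH:
        return "FULL_AND_PIECEWISE"
    if requested == "FULL" and min_cap == AttentionCGSupport.NEVER:
        return "PIECEWISE"  # e.g. under the -O3 compilation mode
    return requested
```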
|
||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ IOProcessorInput = TypeVar("IOProcessorInput")
|
||||
IOProcessorOutput = TypeVar("IOProcessorOutput")
|
||||
|
||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
self.vllm_config = vllm_config
|
||||
|
||||
@ -49,13 +48,24 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
collected_output = [item async for i, item in model_output]
|
||||
# We cannot guarantee outputs are returned in the same order they were
|
||||
# fed to vLLM.
|
||||
# Let's sort them by id before post_processing
|
||||
sorted_output = sorted(
|
||||
[(i, item) async for i, item in model_output], key=lambda output: output[0]
|
||||
)
|
||||
collected_output = [output[1] for output in sorted_output]
|
||||
return self.post_process(collected_output, request_id, **kwargs)
|
||||
|
||||
@abstractmethod
|
||||
def parse_request(self, request: Any) -> IOProcessorInput:
|
||||
raise NotImplementedError
|
||||
|
||||
def validate_or_generate_params(
|
||||
self, params: SamplingParams | PoolingParams | None = None
|
||||
) -> SamplingParams | PoolingParams:
|
||||
return params or PoolingParams()
|
||||
|
||||
@abstractmethod
|
||||
def output_to_response(
|
||||
self, plugin_output: IOProcessorOutput
|
||||
@ -66,10 +76,10 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
|
||||
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
|
||||
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
|
||||
|
||||
The `validate_or_generate_params` method is used by the plugin to validate any `SamplingParams`/`PoolingParams` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
|
||||
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
|
||||
## Using an IO Processor plugin
|
||||
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# Metrics
|
||||
|
||||
Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
|
||||
vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.
|
||||
|
||||
## Objectives
|
||||
|
||||
- Achieve parity of metrics between v0 and v1.
|
||||
- The priority use case is accessing these metrics via Prometheus, as this is what we expect to be used in production environments.
|
||||
- Logging support (i.e. printing metrics to the info log) is provided for more ad-hoc testing, debugging, development, and exploratory use cases.
|
||||
- Provide comprehensive coverage of engine and request level metrics to aid production monitoring.
|
||||
- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
|
||||
- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.
|
||||
|
||||
## Background
|
||||
|
||||
@ -17,51 +17,42 @@ Metrics in vLLM can be categorized as follows:
|
||||
|
||||
The mental model is that server-level metrics help explain the values of request-level metrics.
|
||||
|
||||
### v0 Metrics
|
||||
### Metrics Overview
|
||||
|
||||
In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
|
||||
### v1 Metrics
|
||||
|
||||
- `vllm:num_requests_running` (Gauge)
|
||||
- `vllm:num_requests_swapped` (Gauge)
|
||||
- `vllm:num_requests_waiting` (Gauge)
|
||||
- `vllm:gpu_cache_usage_perc` (Gauge)
|
||||
- `vllm:cpu_cache_usage_perc` (Gauge)
|
||||
- `vllm:gpu_prefix_cache_hit_rate` (Gauge)
|
||||
- `vllm:cpu_prefix_cache_hit_rate` (Gauge)
|
||||
- `vllm:prompt_tokens_total` (Counter)
|
||||
- `vllm:generation_tokens_total` (Counter)
|
||||
- `vllm:request_success_total` (Counter)
|
||||
- `vllm:request_prompt_tokens` (Histogram)
|
||||
- `vllm:request_generation_tokens` (Histogram)
|
||||
- `vllm:time_to_first_token_seconds` (Histogram)
|
||||
- `vllm:time_per_output_token_seconds` (Histogram)
|
||||
- `vllm:e2e_request_latency_seconds` (Histogram)
|
||||
- `vllm:request_queue_time_seconds` (Histogram)
|
||||
- `vllm:request_inference_time_seconds` (Histogram)
|
||||
- `vllm:request_prefill_time_seconds` (Histogram)
|
||||
- `vllm:request_decode_time_seconds` (Histogram)
|
||||
- `vllm:request_max_num_generation_tokens` (Histogram)
|
||||
- `vllm:num_preemptions_total` (Counter)
|
||||
- `vllm:cache_config_info` (Gauge)
|
||||
- `vllm:lora_requests_info` (Gauge)
|
||||
- `vllm:tokens_total` (Counter)
|
||||
- `vllm:iteration_tokens_total` (Histogram)
|
||||
- `vllm:time_in_queue_requests` (Histogram)
|
||||
- `vllm:model_forward_time_milliseconds` (Histogram)
|
||||
- `vllm:model_execute_time_milliseconds` (Histogram)
|
||||
- `vllm:request_params_n` (Histogram)
|
||||
- `vllm:request_params_max_tokens` (Histogram)
|
||||
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
|
||||
- `vllm:spec_decode_efficiency` (Gauge)
|
||||
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
|
||||
|
||||
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
|
||||
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
|
||||
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
|
||||
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
|
||||
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
|
||||
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
|
||||
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
|
||||
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
|
||||
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
|
||||
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
|
||||
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
|
||||
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
|
||||
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
|
||||
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
|
||||
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
|
||||
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
|
||||
- `vllm:request_params_max_tokens` (Histogram) - Histogram of max_tokens parameter in requests.
|
||||
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
|
||||
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
|
||||
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
|
||||
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
|
||||
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
|
||||
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
|
||||
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
|
||||
|
||||
These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
|
||||
|
||||
### Grafana Dashboard
|
||||
|
||||
vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||
vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana/README.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||
|
||||
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
||||
|
||||
@ -86,7 +77,7 @@ See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pul
|
||||
|
||||
Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
|
||||
|
||||
With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
|
||||
During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
|
||||
|
||||
```bash
|
||||
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
|
||||
@ -99,7 +90,9 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
|
||||
|
||||
### Multi-process Mode
|
||||
|
||||
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
|
||||
Historically, metrics were collected in the engine core process and multiprocess mode was used to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
|
||||
|
||||
More recently, metrics are collected in the API server process and multiprocess mode is only used when `--api-server-count > 1`. See <https://github.com/vllm-project/vllm/pull/17546> and details on [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).
|
||||
|
||||
### Built in Python/Process Metrics
|
||||
|
||||
@ -116,14 +109,15 @@ The following metrics are supported by default by `prometheus_client`, but they
|
||||
- `process_open_fds`
|
||||
- `process_max_fds`
|
||||
|
||||
This is relevant because if we move away from multiprocess mode in v1,
|
||||
we get these back. However, it's questionable how relevant these are
|
||||
if they don't aggregate these stats for all processes that make up a
|
||||
vLLM instance.
|
||||
Therefore, these metrics are unavailable when `--api-server-count > 1`. It's questionable how relevant these are since they do not aggregate these stats for all processes that make up a vLLM instance.
|
||||
|
||||
### v0 PRs and Issues
|
||||
## Metrics Design
|
||||
|
||||
For background, these are some of the relevant PRs which added the v0 metrics:
|
||||
The ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where was where much of the metrics design was planned. For example, see where [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
|
||||
|
||||
### Legacy PRs
|
||||
|
||||
To help understand the background to the metrics design, here are some of the relevant PRs which added the original, now legacy, metrics:
|
||||
|
||||
- <https://github.com/vllm-project/vllm/pull/1890>
|
||||
- <https://github.com/vllm-project/vllm/pull/2316>
|
||||
@ -131,14 +125,9 @@ For background, these are some of the relevant PRs which added the v0 metrics:
|
||||
- <https://github.com/vllm-project/vllm/pull/4464>
|
||||
- <https://github.com/vllm-project/vllm/pull/7279>
|
||||
|
||||
Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
|
||||
### Metrics Implementation PRs
|
||||
|
||||
## v1 Design
|
||||
|
||||
### v1 PRs
|
||||
|
||||
For background, here are the relevant v1 PRs relating to the v1
|
||||
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:
|
||||
For background, here are the relevant PRs relating to the metrics implementation <https://github.com/vllm-project/vllm/issues/10582>:
|
||||
|
||||
- <https://github.com/vllm-project/vllm/pull/11962>
|
||||
- <https://github.com/vllm-project/vllm/pull/11973>
|
||||
@ -369,7 +358,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
|
||||
|
||||
However, `prometheus_client` has
|
||||
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
|
||||
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
|
||||
for [unclear reasons](gh-pr:7279#discussion_r1710417152). We
|
||||
simply use a `Gauge` metric set to 1 and
|
||||
`multiprocess_mode="mostrecent"` instead.
|
||||
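A minimal sketch of that trick with `prometheus_client` (label names abridged from the example above, help text illustrative; `multiprocess_mode="mostrecent"` requires a reasonably recent `prometheus_client`):

```python
from prometheus_client import Gauge

# An "info"-style metric: a Gauge pinned to 1 whose labels carry the data.
# multiprocess_mode="mostrecent" keeps it usable when multiprocess mode is on.
cache_config_info = Gauge(
    "vllm:cache_config_info",
    "Engine CacheConfig values (illustrative help text)",
    labelnames=["block_size", "cache_dtype"],
    multiprocess_mode="mostrecent",
)
cache_config_info.labels(block_size="16", cache_dtype="auto").set(1)
```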
|
||||
@ -396,9 +385,8 @@ recent metric is used, but only from currently running processes.
|
||||
|
||||
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
|
||||
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
||||
If we revisit this design and deprecate the old metric, we should reduce
|
||||
the need for a significant deprecation period by making the change in
|
||||
v0 also and asking this project to move to the new metric.
|
||||
If we revisit this design and deprecate the old metric, we should
|
||||
coordinate with downstream users so they can migrate before the removal.
|
||||
|
||||
### Prefix Cache metrics
|
||||
|
||||
@ -478,22 +466,20 @@ us with:
|
||||
|
||||
```python
|
||||
if seq_group.is_finished():
|
||||
if (
|
||||
seq_group.metrics.first_scheduled_time is not None
|
||||
and seq_group.metrics.first_token_time is not None
|
||||
):
|
||||
if (seq_group.metrics.first_scheduled_time is not None and
|
||||
seq_group.metrics.first_token_time is not None):
|
||||
time_queue_requests.append(
|
||||
seq_group.metrics.first_scheduled_time -
|
||||
seq_group.metrics.arrival_time
|
||||
)
|
||||
seq_group.metrics.arrival_time)
|
||||
...
|
||||
if seq_group.metrics.time_in_queue is not None:
|
||||
time_in_queue_requests.append(seq_group.metrics.time_in_queue)
|
||||
time_in_queue_requests.append(
|
||||
seq_group.metrics.time_in_queue)
|
||||
```
|
||||
|
||||
This seems duplicative, and one of them should be removed. The latter
|
||||
is used by the Grafana dashboard, so we should deprecate or remove the
|
||||
former from v0.
|
||||
former.
|
||||
|
||||
### Prefix Cache Hit Rate
|
||||
|
||||
@ -502,7 +488,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a
|
||||
|
||||
### KV Cache Offloading
|
||||
|
||||
Two v0 metrics relate to a "swapped" preemption mode that is no
|
||||
Two legacy metrics relate to a "swapped" preemption mode that is no
|
||||
longer relevant in v1:
|
||||
|
||||
- `vllm:num_requests_swapped`
|
||||
@ -513,7 +499,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
|
||||
memory. This is also known as "KV cache offloading" and is configured
|
||||
with `--swap-space` and `--preemption-mode`.
|
||||
|
||||
In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||
SequenceGroup encapsulated the idea of N Sequences which
|
||||
all shared the same prompt kv blocks. This enabled KV cache block
|
||||
sharing between requests, and copy-on-write to do branching. CPU
|
||||
@ -526,7 +512,7 @@ and the part of the prompt that was evicted can be recomputed.
|
||||
|
||||
SequenceGroup was removed in V1, although a replacement will be
|
||||
required for "parallel sampling" (`n>1`).
|
||||
[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
|
||||
[Beam search was moved out of the core](https://github.com/vllm-project/vllm/issues/8306). There was a
|
||||
lot of complex code for a very uncommon feature.
|
||||
|
||||
In V1, with prefix caching being better (zero overhead) and therefore
|
||||
@ -537,7 +523,7 @@ better.
|
||||
|
||||
### Parallel Sampling
|
||||
|
||||
Some v0 metrics are only relevant in the context of "parallel
|
||||
Some legacy metrics are only relevant in the context of "parallel
|
||||
sampling". This is where the `n` parameter in a request is used to
|
||||
request multiple completions from the same prompt.
|
||||
|
||||
@ -556,7 +542,7 @@ also add these metrics.
|
||||
|
||||
### Speculative Decoding
|
||||
|
||||
Some v0 metrics are specific to "speculative decoding". This is where
|
||||
Some legacy metrics are specific to "speculative decoding". This is where
|
||||
we generate candidate tokens using a faster, approximate method or
|
||||
model and then validate those tokens with the larger model.
|
||||
|
||||
@ -568,7 +554,7 @@ model and then validate those tokens with the larger model.
|
||||
|
||||
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
|
||||
speculative decoding to v1. Other techniques will follow. We should
|
||||
revisit the v0 metrics in this context.
|
||||
revisit these metrics in this context.
|
||||
|
||||
!!! note
|
||||
We should probably expose acceptance rate as separate accepted
|
||||
@ -641,7 +627,7 @@ metrics are often relatively straightforward to add:
|
||||
metrics are usually of very limited use unless they can be enabled
|
||||
by default and in production.
|
||||
3. They have an impact on development and maintenance of the
|
||||
project. Every metric added to v0 has made this v1 effort more
|
||||
project. Every metric added over time has made this effort more
|
||||
time-consuming, and perhaps not all metrics justify this ongoing
|
||||
investment in their maintenance.
|
||||
|
||||
@ -652,24 +638,24 @@ performance and health. Tracing, on the other hand, tracks individual
|
||||
requests as they move through different services and components. Both
|
||||
fall under the more general heading of "Observability".
|
||||
|
||||
v0 has support for OpenTelemetry tracing:
|
||||
vLLM has support for OpenTelemetry tracing:
|
||||
|
||||
- Added by <https://github.com/vllm-project/vllm/pull/4687>
|
||||
- Added by <https://github.com/vllm-project/vllm/pull/4687> and reinstated by <https://github.com/vllm-project/vllm/pull/20372>
|
||||
- Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
|
||||
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||
- [User-facing docs](../examples/online_serving/opentelemetry.md)
|
||||
- [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
|
||||
- [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)
|
||||
|
||||
|
||||
OpenTelemetry has a
|
||||
[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
|
||||
|
||||
Since metrics is a big enough topic on its own, we are going to tackle
|
||||
the topic of tracing in v1 separately.
|
||||
Since metrics is a big enough topic on its own, we consider the topic
|
||||
of tracing to be quite separate from metrics.
|
||||
|
||||
### OpenTelemetry Model Forward vs Execute Time
|
||||
|
||||
In v0, we have the following two metrics:
|
||||
The current implementation exposes the following two metrics:
|
||||
|
||||
- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
|
||||
in the model forward pass when this request was in the batch.
|
||||
|
||||
@ -213,22 +213,22 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
|
||||
|
||||

|
||||
|
||||
**Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
||||
**Time 2: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
||||
|
||||

|
||||

|
||||
|
||||
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||
**Time 3: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||
|
||||

|
||||

|
||||
|
||||
**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
||||
**Time 4: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
||||
|
||||

|
||||

|
||||
|
||||
**Time 6: Request 1 is finished and free.**
|
||||
**Time 5: Request 1 is finished and free.**
|
||||
|
||||

|
||||

|
||||
|
||||
**Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
||||
**Time 6: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
||||
|
||||

|
||||

|
||||
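The touch-before-allocate behaviour described above can be sketched with a toy free queue (block IDs as plain integers, numbers taken from the example):

```python
from collections import deque

# Toy sketch of the free-queue bookkeeping described above. "Touching" a
# cache-hit block removes it from the free queue so it cannot be evicted
# while the new request reuses it.
free_queue = deque([7, 8, 9, 4, 3, 2, 6, 5, 1, 0])
cache_hit_blocks = [0, 1, 2]  # prefix blocks shared with request 0

for block in cache_hit_blocks:
    free_queue.remove(block)  # touch: no longer evictable
print(list(free_queue))  # [7, 8, 9, 4, 3, 6, 5]

# Blocks for the rest of the prompt are popped from the queue head, evicting
# (and un-caching) whatever was there: 7, 8, 9, 4, 3 in this example.
num_new_blocks = 5
allocated = cache_hit_blocks + [free_queue.popleft() for _ in range(num_new_blocks)]
print(allocated)  # [0, 1, 2, 7, 8, 9, 4, 3]
```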
|
||||
@ -52,7 +52,7 @@ th:not(:first-child) {
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
@ -75,4 +75,4 @@ th:not(:first-child) {
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
|
||||
|
||||
@ -359,13 +359,19 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offl
|
||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
|
||||
You must enable this feature via `enable_mm_embeds=True`.
|
||||
|
||||
!!! warning
|
||||
The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||
Only enable this flag for trusted users!
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# Inference with image embeddings as input
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)
|
||||
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||
@ -397,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
||||
image_embeds = torch.load(...)
|
||||
|
||||
# Qwen2-VL
|
||||
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
||||
llm = LLM(
|
||||
"Qwen/Qwen2-VL-2B-Instruct",
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
enable_mm_embeds=True,
|
||||
)
|
||||
mm_data = {
|
||||
"image": {
|
||||
"image_embeds": image_embeds,
|
||||
@ -407,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
||||
}
|
||||
|
||||
# MiniCPM-V
|
||||
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
|
||||
llm = LLM(
|
||||
"openbmb/MiniCPM-V-2_6",
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
enable_mm_embeds=True,
|
||||
)
|
||||
mm_data = {
|
||||
"image": {
|
||||
"image_embeds": image_embeds,
|
||||
@ -732,7 +747,13 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
|
||||
### Embedding Inputs
|
||||
|
||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||
pass a tensor of shape to the corresponding field of the multi-modal dictionary.
|
||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
|
||||
You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
|
||||
|
||||
!!! warning
|
||||
The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||
Only enable this flag for trusted users!
|
||||
|
||||
#### Image Embedding Inputs
|
||||
|
||||
|
||||
@ -20,12 +20,16 @@ You can pass prompt embeddings from Hugging Face Transformers models to the `'p
|
||||
|
||||
## Online Serving
|
||||
|
||||
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
|
||||
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
|
||||
|
||||
When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
|
||||
|
||||
Prompt embeddings are passed in as base64 encoded torch tensors.
|
||||
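As a rough sketch of producing that payload (the tensor shape here is arbitrary, and the exact request fields should be checked against the client example below):

```python
import base64
import io

import torch

# Toy prompt embeddings; in practice these come from a HF Transformers model
# (shape and dtype must match what the served model expects).
prompt_embeds = torch.randn(16, 4096)

# Serialize with torch.save and base64-encode the bytes (one common encoding,
# assumed here; see the client example below for the full request).
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

# The encoded string is then sent as the `prompt_embeds` field of a
# Completions request (e.g. via the OpenAI client's `extra_body`).
```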
|
||||
!!! warning
|
||||
The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||
Only enable this flag for trusted users!
|
||||
|
||||
### Transformers Inputs via OpenAI Client
|
||||
|
||||
First, launch the OpenAI-compatible server:
|
||||
|
||||
@ -12,32 +12,56 @@ This guide will help you quickly get started with vLLM to perform:
|
||||
|
||||
## Installation
|
||||
|
||||
If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
|
||||
If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
|
||||
|
||||
```bash
|
||||
uv venv --python 3.12 --seed
|
||||
source .venv/bin/activate
|
||||
uv pip install vllm --torch-backend=auto
|
||||
```
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
|
||||
|
||||
`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`).
|
||||
```bash
|
||||
uv venv --python 3.12 --seed
|
||||
source .venv/bin/activate
|
||||
uv pip install vllm --torch-backend=auto
|
||||
```
|
||||
|
||||
Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
|
||||
`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`).
|
||||
|
||||
```bash
|
||||
uv run --with vllm vllm --help
|
||||
```
|
||||
Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
|
||||
|
||||
You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
|
||||
```bash
|
||||
uv run --with vllm vllm --help
|
||||
```
|
||||
|
||||
```bash
|
||||
conda create -n myenv python=3.12 -y
|
||||
conda activate myenv
|
||||
pip install --upgrade uv
|
||||
uv pip install vllm --torch-backend=auto
|
||||
```
|
||||
You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
|
||||
|
||||
```bash
|
||||
conda create -n myenv python=3.12 -y
|
||||
conda activate myenv
|
||||
pip install --upgrade uv
|
||||
uv pip install vllm --torch-backend=auto
|
||||
```
|
||||
|
||||
=== "AMD ROCm"
|
||||
|
||||
Use a pre-built docker image from Docker Hub. The public stable image is [rocm/vllm:latest](https://hub.docker.com/r/rocm/vllm). There is also a development image at [rocm/vllm-dev](https://hub.docker.com/r/rocm/vllm-dev).
|
||||
|
||||
The `-v` flag in the `docker run` command below mounts a local directory into the container. Replace `<path/to/your/models>` with the path on your host machine to the directory containing your models. The models will then be accessible inside the container at `/app/models`.
|
||||
|
||||
???+ console "Commands"
|
||||
```bash
|
||||
docker pull rocm/vllm-dev:nightly # to get the latest image
|
||||
docker run -it --rm \
|
||||
--network=host \
|
||||
--group-add=video \
|
||||
--ipc=host \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
-v <path/to/your/models>:/app/models \
|
||||
-e HF_HOME="/app/models" \
|
||||
rocm/vllm-dev:nightly
|
||||
```
|
||||
|
||||
!!! note
|
||||
For more details, and for non-CUDA platforms, please refer [here](installation/README.md) for specific instructions on how to install vLLM.
|
||||
@ -246,7 +270,17 @@ Alternatively, you can use the `openai` Python package:
|
||||
|
||||
Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.
|
||||
|
||||
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
|
||||
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
|
||||
|
||||
- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
|
||||
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
|
||||
|
||||
For AMD ROCm, you can further control the specific Attention implementation using the following variables:
|
||||
|
||||
- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
|
||||
- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
|
||||
- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0`
|
||||
- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1`
|
||||
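For example, a minimal sketch of pinning the backend before constructing the engine (setting the variable in the shell before `vllm serve` works the same way; the model name is just an example):

```python
import os

# Set before importing vLLM so the environment configuration is picked up.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")  # example model from this page
```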
|
||||
!!! warning
|
||||
There are no pre-built vLLM wheels containing FlashInfer, so you must install it in your environment first. Refer to the [FlashInfer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.
|
||||
|
||||
@ -16,8 +16,8 @@
|
||||
| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ |
|
||||
| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 |
|
||||
| microsoft/phi-4 | Phi3ForCausalLM | ❌ |
|
||||
| google/gemma-3-27b-it | TransformersForMultimodalLM | 🟨 |
|
||||
| google/gemma-3-4b-it | TransformersForMultimodalLM | ❌ |
|
||||
| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 |
|
||||
| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ |
|
||||
| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ |
|
||||
| deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM | ❌ |
|
||||
| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ |
|
||||
|
||||
@ -374,8 +374,8 @@ th {
|
||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
@ -634,12 +634,15 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
@ -669,6 +672,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
@ -693,8 +697,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |

<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
• For example, to use DeepSeek-VL2 series models:
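A minimal offline sketch of that override, assuming the `LLM` entrypoint's `hf_overrides` argument mirrors the `--hf-overrides` CLI flag and reusing the `DeepseekVLV2ForCausalLM` architecture name from the table above; the exact override for other models depends on their row in the supported-models list:

```python
from vllm import LLM

# Sketch only: point the checkpoint at the architecture name vLLM registers
# for DeepSeek-VL2 (see the table above).
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
```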
@ -703,7 +705,21 @@ Some models are supported only via the [Transformers backend](#transformers). Th
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

!!! warning
    For `Gemma3ForConditionalGeneration`, `{"do_pan_and_scan": true}` is not supported in Transformers backend yet.

    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
    However, there are differences in how they handle text + image inputs:

    V0 correctly implements the model's attention pattern:
    - Uses bidirectional attention between the image tokens corresponding to the same image
    - Uses causal attention for other tokens
    - Implemented via (naive) PyTorch SDPA with masking tensors
    - Note: May use significant memory for long prompts with image

    V1 currently uses a simplified attention pattern:
    - Uses causal attention for all tokens, including image tokens
    - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
    - Will be updated in the future to support the correct behavior

    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
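To make the mixed pattern above concrete, here is a small standalone sketch of such a mask. It only illustrates the idea (bidirectional attention inside each image's token span, causal attention everywhere else); it is not vLLM's or the model's actual implementation, and the image spans are invented for the example.

```python
import torch


def gemma3_style_mask(seq_len: int, image_spans: list[tuple[int, int]]) -> torch.Tensor:
    """Boolean mask (True = attention allowed): causal base, plus full
    bidirectional attention among tokens of the same image span.

    ``image_spans`` holds half-open (start, end) index ranges and is purely
    illustrative.
    """
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))  # causal base
    for start, end in image_spans:
        mask[start:end, start:end] = True  # same-image tokens see each other fully
    return mask


# Example: a 10-token prompt whose tokens 2..5 come from a single image.
print(gemma3_style_mask(10, [(2, 6)]).int())
```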
!!! note
    `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
@ -755,6 +771,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>

!!! warning
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

!!! note
    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
@ -786,6 +805,7 @@ The following table lists those that are tested in vLLM.
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `SiglipModel` | SigLIP | T / I | `google/siglip-base-patch16-224` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |

<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
@ -5,6 +5,7 @@ Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes
The following open-source RL libraries use vLLM for fast rollouts (sorted alphabetically and non-exhaustive):

- [Cosmos-RL](https://github.com/nvidia-cosmos/cosmos-rl)
- [ms-swift](https://github.com/modelscope/ms-swift/tree/main)
- [NeMo-RL](https://github.com/NVIDIA-NeMo/RL)
- [Open Instruct](https://github.com/allenai/open-instruct)
- [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF)
@ -33,7 +33,7 @@ import os
|
||||
from time import sleep
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
||||
@ -49,6 +49,7 @@ class PrithviMAE:
|
||||
dtype="float16",
|
||||
enforce_eager=True,
|
||||
model_impl="terratorch",
|
||||
enable_mm_embeds=True,
|
||||
)
|
||||
|
||||
def run(self, input_data, location_coords):
|
||||
@ -63,7 +64,7 @@ class PrithviMAE:
|
||||
}
|
||||
|
||||
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
|
||||
outputs = self.model.encode(prompt, use_tqdm=False)
|
||||
outputs = self.model.encode(prompt, pooling_task="plugin", use_tqdm=False)
|
||||
|
||||
return outputs[0].outputs.data
|
||||
|
||||
|
||||
@ -6,14 +6,14 @@ import os
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.pooling_params import PoolingParams
|
||||
|
||||
# This example shows how to perform an offline inference that generates
|
||||
# multimodal data. In this specific case this example will take a geotiff
|
||||
# image as input, process it using the multimodal data processor, and
|
||||
# perform inference.
|
||||
# Requirement - install plugin at:
|
||||
# https://github.com/christian-pinto/prithvi_io_processor_plugin
|
||||
# Requirements:
|
||||
# - install TerraTorch v1.1 (or later):
|
||||
# pip install terratorch>=v1.1
|
||||
|
||||
|
||||
def main():
|
||||
@ -36,15 +36,12 @@ def main():
|
||||
# to avoid the model going OOM.
|
||||
# The maximum number depends on the available GPU memory
|
||||
max_num_seqs=32,
|
||||
io_processor_plugin="prithvi_to_tiff",
|
||||
io_processor_plugin="terratorch_segmentation",
|
||||
model_impl="terratorch",
|
||||
enable_mm_embeds=True,
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(task="token_classify", activation=False)
|
||||
pooler_output = llm.encode(
|
||||
img_prompt,
|
||||
pooling_params=pooling_params,
|
||||
)
|
||||
pooler_output = llm.encode(img_prompt, pooling_task="plugin")
|
||||
output = pooler_output[0].outputs
|
||||
|
||||
print(output)
|
||||
|
||||
@ -38,7 +38,7 @@ from rlhf_utils import stateless_init_process_group
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import get_ip, get_open_port
|
||||
from vllm.utils.network_utils import get_ip, get_open_port
|
||||
|
||||
|
||||
class MyLLM(LLM):
|
||||
|
||||
@ -30,6 +30,7 @@ class ModelRequestData(NamedTuple):
|
||||
prompts: list[str]
|
||||
stop_token_ids: list[int] | None = None
|
||||
lora_requests: list[LoRARequest] | None = None
|
||||
sampling_params: list[SamplingParams] | None = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
@ -90,6 +91,33 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Bee-8B
|
||||
def run_bee(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "Open-Bee/Bee-8B-RL"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<image>\n{question}<|im_end|>"
|
||||
f"<|im_start|>assistant\n<think>\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=16384,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# BLIP-2
|
||||
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -126,23 +154,6 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Dots-OCR
|
||||
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="rednote-hilab/dots.ocr",
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
@ -190,6 +201,66 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
|
||||
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "deepseek-ai/DeepSeek-OCR"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
logits_processors=[NGramPerReqLogitsProcessor],
|
||||
)
|
||||
|
||||
# DeepSeek-OCR uses a plain prompt template
|
||||
prompts = [f"<image>\n{question}" for question in questions]
|
||||
|
||||
# The following sampling params config is taken from
|
||||
# the official Deepseek-OCR inference example.
|
||||
# (IMPORTANT) Use the custom logits processor and avoid skipping
|
||||
# special tokens for this model for the optimal OCR performance.
|
||||
sampling_params = [
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=8192,
|
||||
# ngram logit processor args
|
||||
extra_args=dict(
|
||||
ngram_size=30,
|
||||
window_size=90,
|
||||
# whitelist: <td>, </td>
|
||||
whitelist_token_ids={128821, 128822},
|
||||
),
|
||||
skip_special_tokens=False,
|
||||
)
|
||||
for _ in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
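The `NGramPerReqLogitsProcessor` and the `ngram_size`/`window_size`/`whitelist_token_ids` arguments above exist to keep OCR decoding from looping on repeated fragments. As a rough, self-contained illustration of the general idea only (not vLLM's implementation), such a blocker bans any token that would re-complete an n-gram already present in the recent window, except for whitelisted tokens:

```python
# Illustrative n-gram repetition blocker; names and behavior are assumptions,
# not the actual NGramPerReqLogitsProcessor from vLLM.
def banned_next_tokens(
    output_ids: list[int],
    ngram_size: int,
    window_size: int,
    whitelist: set[int],
) -> set[int]:
    window = output_ids[-window_size:]
    if len(window) < ngram_size:
        return set()
    prefix = tuple(window[-(ngram_size - 1):])  # what the next token would extend
    banned = set()
    for i in range(len(window) - ngram_size + 1):
        if tuple(window[i : i + ngram_size - 1]) == prefix:
            banned.add(window[i + ngram_size - 1])
    return banned - whitelist


# Example: with ngram_size=3, emitting 9 would recreate the (7, 8, 9) trigram.
print(banned_next_tokens([1, 7, 8, 9, 2, 7, 8], ngram_size=3, window_size=90, whitelist=set()))
# -> {9}
```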
|
||||
|
||||
|
||||
# Dots-OCR
|
||||
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="rednote-hilab/dots.ocr",
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Ernie4.5-VL
|
||||
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
|
||||
@ -248,8 +319,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
# TODO: Support this in transformers backend
|
||||
# mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
@ -1708,11 +1778,13 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_example_map = {
|
||||
"aria": run_aria,
|
||||
"aya_vision": run_aya_vision,
|
||||
"bee": run_bee,
|
||||
"blip-2": run_blip2,
|
||||
"chameleon": run_chameleon,
|
||||
"dots_ocr": run_dots_ocr,
|
||||
"command_a_vision": run_command_a_vision,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
"deepseek_ocr": run_deepseek_ocr,
|
||||
"dots_ocr": run_dots_ocr,
|
||||
"ernie45_vl": run_ernie45_vl,
|
||||
"fuyu": run_fuyu,
|
||||
"gemma3": run_gemma3,
|
||||
@ -1975,8 +2047,12 @@ def main(args):
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
|
||||
sampling_params = (
|
||||
SamplingParams(
|
||||
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
|
||||
)
|
||||
if req_data.sampling_params is None
|
||||
else req_data.sampling_params
|
||||
)
|
||||
|
||||
assert args.num_prompts > 0
|
||||
|
||||
@ -44,6 +44,7 @@ class ModelRequestData(NamedTuple):
|
||||
stop_token_ids: list[int] | None = None
|
||||
chat_template: str | None = None
|
||||
lora_requests: list[LoRARequest] | None = None
|
||||
sampling_params: SamplingParams | None = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
@ -107,6 +108,41 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Open-Bee/Bee-8B-RL"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=16384,
|
||||
max_num_seqs=16,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "CohereLabs/command-a-vision-07-2025"
|
||||
|
||||
@ -166,6 +202,46 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
|
||||
|
||||
model_name = "deepseek-ai/DeepSeek-OCR"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
logits_processors=[NGramPerReqLogitsProcessor],
|
||||
)
|
||||
|
||||
placeholder = "<image>\n" * len(image_urls)
|
||||
prompt = placeholder + question
|
||||
|
||||
# The following sampling params config is taken from
|
||||
# the official Deepseek-OCR inference example.
|
||||
# (IMPORTANT) Use the custom logits processor and avoid skipping
|
||||
# special tokens for this model for the optimal OCR performance.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=8192,
|
||||
# ngram logit processor args
|
||||
extra_args=dict(
|
||||
ngram_size=30,
|
||||
window_size=90,
|
||||
# whitelist: <td>, </td>
|
||||
whitelist_token_ids={128821, 128822},
|
||||
),
|
||||
skip_special_tokens=False,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
|
||||
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
@ -1215,8 +1291,10 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_example_map = {
|
||||
"aria": load_aria,
|
||||
"aya_vision": load_aya_vision,
|
||||
"bee": load_bee,
|
||||
"command_a_vision": load_command_a_vision,
|
||||
"deepseek_vl_v2": load_deepseek_vl2,
|
||||
"deepseek_ocr": load_deepseek_ocr,
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||
@ -1289,8 +1367,12 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
|
||||
sampling_params = (
|
||||
SamplingParams(
|
||||
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
|
||||
)
|
||||
if req_data.sampling_params is None
|
||||
else req_data.sampling_params
|
||||
)
|
||||
outputs = llm.chat(
|
||||
[
|
||||
|
||||
@ -110,6 +110,53 @@ def run_e5_v(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
if query["modality"] != "text+images":
|
||||
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="jinaai/jina-reranker-m0",
|
||||
runner="pooling",
|
||||
max_model_len=32768,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 602112,
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
query=query["text"],
|
||||
documents=query["image"],
|
||||
)
|
||||
|
||||
|
||||
def run_siglip(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = "" # For image input, make sure that the prompt text is empty
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="google/siglip-base-patch16-224",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
@ -211,29 +258,6 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
if query["modality"] != "text+images":
|
||||
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="jinaai/jina-reranker-m0",
|
||||
runner="pooling",
|
||||
max_model_len=32768,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 602112,
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
query=query["text"],
|
||||
documents=query["image"],
|
||||
)
|
||||
|
||||
|
||||
def get_query(modality: QueryModality):
|
||||
if modality == "text":
|
||||
return TextQuery(modality="text", text="A dog sitting in the grass")
|
||||
@ -328,9 +352,10 @@ def run_score(model: str, modality: QueryModality, seed: int | None):
|
||||
model_example_map = {
|
||||
"clip": run_clip,
|
||||
"e5_v": run_e5_v,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
"siglip": run_siglip,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -530,7 +530,7 @@ spec:
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
|
||||
query: >
|
||||
100 * avg(vllm:gpu_cache_usage_perc)
|
||||
100 * avg(vllm:kv_cache_usage_perc)
|
||||
|
||||
"18":
|
||||
kind: Panel
|
||||
|
||||
@ -98,7 +98,7 @@ spec:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_running_ts:
|
||||
@ -168,7 +168,7 @@ spec:
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
|
||||
query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_pct_ts:
|
||||
@ -187,7 +187,7 @@ spec:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Per-Pod breakdowns (works on Simulator & Real) ---
|
||||
@ -246,7 +246,7 @@ spec:
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
|
||||
query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Real vLLM only (zeros on simulator) ---
|
||||
|
||||
@ -6,10 +6,16 @@
python examples/online_serving/pooling/cohere_rerank_client.py
```

## Embedding embed_dtype usage
## Embedding requests base64 encoding_format usage

```bash
python examples/online_serving/pooling/embedding_embed_dtype_client.py
python examples/online_serving/pooling/embedding_requests_base64_client.py
```

## Embedding requests bytes encoding_format usage

```bash
python examples/online_serving/pooling/embedding_requests_bytes_client.py
```

## Jinaai rerank usage
@ -12,7 +12,11 @@ import base64
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
binary2tensor,
|
||||
)
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
@ -34,24 +38,25 @@ def main(args):
|
||||
api_url = f"http://{args.host}:{args.port}/v1/embeddings"
|
||||
model_name = args.model
|
||||
|
||||
for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
|
||||
prompt = {
|
||||
"model": model_name,
|
||||
"input": "vLLM is great!",
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": embed_dtype,
|
||||
}
|
||||
response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model_name,
|
||||
"input": "vLLM is great!",
|
||||
"encoding_format": "base64",
|
||||
"embed_dtype": embed_dtype,
|
||||
"endianness": endianness,
|
||||
}
|
||||
response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
|
||||
embedding = []
|
||||
for data in response.json()["data"]:
|
||||
embedding.append(
|
||||
torch.frombuffer(
|
||||
base64.b64decode(data["embedding"]), dtype=torch_dtype
|
||||
).to(torch.float32)
|
||||
)
|
||||
embedding = torch.cat(embedding)
|
||||
print(embed_dtype, embedding.shape)
|
||||
embedding = []
|
||||
for data in response.json()["data"]:
|
||||
binary = base64.b64decode(data["embedding"])
|
||||
tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
|
||||
embedding.append(tensor.to(torch.float32))
|
||||
embedding = torch.cat(embedding)
|
||||
print(embed_dtype, endianness, embedding.shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
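For context on what `binary2tensor` undoes above: with `encoding_format="base64"` and a non-default `embed_dtype`, the embedding travels as raw tensor bytes, base64-encoded. A minimal self-contained sketch of that round trip (little-endian `float16` assumed for the example):

```python
import base64

import torch

# Pretend this is one embedding vector the server produced.
vec = torch.randn(8, dtype=torch.float32)

# Server side (sketch): cast to the requested embed_dtype and base64-encode the bytes.
payload = base64.b64encode(vec.to(torch.float16).numpy().tobytes()).decode("ascii")

# Client side (sketch): decode, reinterpret the bytes with the same dtype,
# then upcast for downstream use.
decoded = torch.frombuffer(
    bytearray(base64.b64decode(payload)), dtype=torch.float16
).to(torch.float32)

print(torch.allclose(vec, decoded, atol=1e-2))
```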
@ -0,0 +1,66 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve intfloat/e5-small
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
MetadataItem,
|
||||
decode_pooling_output,
|
||||
)
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
headers = {"User-Agent": "Test Client"}
|
||||
response = requests.post(api_url, headers=headers, json=prompt)
|
||||
return response
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--model", type=str, default="intfloat/e5-small")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
api_url = f"http://{args.host}:{args.port}/v1/embeddings"
|
||||
model_name = args.model
|
||||
|
||||
# The OpenAI client does not support the bytes encoding_format.
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model_name,
|
||||
"input": "vLLM is great!",
|
||||
"encoding_format": "bytes",
|
||||
"embed_dtype": embed_dtype,
|
||||
"endianness": endianness,
|
||||
}
|
||||
response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
metadata = json.loads(response.headers["metadata"])
|
||||
body = response.content
|
||||
items = [MetadataItem(**x) for x in metadata["data"]]
|
||||
|
||||
embedding = decode_pooling_output(items=items, body=body)
|
||||
embedding = [x.to(torch.float32) for x in embedding]
|
||||
embedding = torch.cat(embedding)
|
||||
print(embed_dtype, endianness, embedding.shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -83,6 +83,109 @@ def run_clip(client: OpenAI, model: str):
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
"""
|
||||
Start the server using:
|
||||
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/template_dse_qwen2_vl.jinja
|
||||
"""
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What is shown in this image?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_placeholder}",
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "Query: What is the weather like today?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_siglip(client: OpenAI, model: str):
|
||||
"""
|
||||
Start the server using:
|
||||
|
||||
vllm serve google/siglip-base-patch16-224 \
|
||||
--runner pooling
|
||||
"""
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "a photo of a cat"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_vlm2vec(client: OpenAI, model: str):
|
||||
"""
|
||||
Start the server using:
|
||||
@ -148,72 +251,11 @@ def run_vlm2vec(client: OpenAI, model: str):
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
"""
|
||||
Start the server using:
|
||||
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/template_dse_qwen2_vl.jinja
|
||||
"""
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What is shown in this image?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_placeholder}",
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "Query: What is the weather like today?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"clip": run_clip,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"dse_qwen2_vl": run_dse_qwen2_vl,
|
||||
"siglip": run_siglip,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -11,14 +11,15 @@ import requests
|
||||
# image as input, process it using the multimodal data processor, and
|
||||
# perform inference.
|
||||
# Requirements :
|
||||
# - install plugin at:
|
||||
# https://github.com/christian-pinto/prithvi_io_processor_plugin
|
||||
# - install TerraTorch v1.1 (or later):
|
||||
# pip install terratorch>=v1.1
|
||||
# - start vllm in serving mode with the below args
|
||||
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
|
||||
# --model-impl terratorch
|
||||
# --task embed --trust-remote-code
|
||||
# --skip-tokenizer-init --enforce-eager
|
||||
# --io-processor-plugin prithvi_to_tiff
|
||||
# --io-processor-plugin terratorch_segmentation
|
||||
# --enable-mm-embeds
|
||||
|
||||
|
||||
def main():
|
||||
@ -34,7 +35,6 @@ def main():
|
||||
},
|
||||
"priority": 0,
|
||||
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
|
||||
"softmax": False,
|
||||
}
|
||||
|
||||
ret = requests.post(server_endpoint, json=request_payload_url)
|
||||
|
||||
@ -852,7 +852,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
|
||||
"expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}",
|
||||
"instant": false,
|
||||
"legendFormat": "GPU Cache Usage",
|
||||
"range": true,
|
||||
|
||||
@ -6,7 +6,7 @@ requires = [
|
||||
"packaging>=24.2",
|
||||
"setuptools>=77.0.3,<80.0.0",
|
||||
"setuptools-scm>=8.0",
|
||||
"torch == 2.8.0",
|
||||
"torch == 2.9.0",
|
||||
"wheel",
|
||||
"jinja2",
|
||||
]
|
||||
|
||||
@ -4,7 +4,7 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
torch==2.8.0
|
||||
torch==2.9.0
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
|
||||
@ -48,3 +48,4 @@ pybase64 # fast base64 implementation
|
||||
cbor2 # Required for cross-language serialization of hashable objects
|
||||
setproctitle # Used to set process names for better debugging and monitoring
|
||||
openai-harmony >= 0.0.3 # Required for gpt-oss
|
||||
anthropic == 0.71.0
|
||||
|
||||
@ -6,6 +6,7 @@ setuptools-scm>=8
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
||||
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
|
||||
@ -5,11 +5,13 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
torch==2.8.0
|
||||
torchaudio==2.8.0
|
||||
torch==2.9.0
|
||||
torchaudio==2.9.0
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
|
||||
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
|
||||
# xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.4.1
|
||||
flashinfer-python==0.4.1
|
||||
# Triton Kernels are needed for mxfp4 fused moe. (Should be updated alongside torch)
|
||||
triton_kernels @ git+https://github.com/triton-lang/triton.git@v3.5.0#subdirectory=python/triton_kernels
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.3
|
||||
torch==2.8.0
|
||||
torchvision==0.23.0
|
||||
torchaudio==2.8.0
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.4
|
||||
torch==2.9.0
|
||||
torchvision==0.24.0
|
||||
torchaudio==2.9.0
|
||||
|
||||
triton==3.3.0
|
||||
triton==3.5.0
|
||||
cmake>=3.26.1,<4
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
tblib==3.1.0
|
||||
bm25s==0.2.13
|
||||
pystemmer==3.0.0
|
||||
|
||||
# entrypoints test
|
||||
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
|
||||
@ -29,6 +31,8 @@ matplotlib==3.10.3
|
||||
# Multi-Modal Models Test (Extended) 3
|
||||
blobfile==3.0.0
|
||||
|
||||
schemathesis==3.39.15 # Required for openai schema test.
|
||||
# Required for openai schema test.
|
||||
schemathesis==3.39.15
|
||||
|
||||
mteb[bm25s]>=1.38.11, <2 # required for mteb test
|
||||
# required for mteb test
|
||||
mteb[bm25s]>=1.38.11, <2
|
||||
|
||||
@ -24,9 +24,9 @@ soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
tblib # for pickling test exceptions
|
||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.8.0
|
||||
torchaudio==2.8.0
|
||||
torchvision==0.23.0
|
||||
torch==2.9.0
|
||||
torchaudio==2.9.0
|
||||
torchvision==0.24.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
|
||||
@ -55,4 +55,4 @@ fastsafetensors>=0.1.10
|
||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
decord==0.6.0
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
|
||||
gpt-oss >= 0.0.7; python_version > '3.11'
|
||||
gpt-oss >= 0.0.7; python_version > '3.11'
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28
|
||||
absl-py==2.1.0
|
||||
# via rouge-score
|
||||
accelerate==1.0.1
|
||||
@ -573,42 +573,44 @@ numpy==1.26.4
|
||||
# tritonclient
|
||||
# vocos
|
||||
# xarray
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
nvidia-cublas-cu12==12.9.1.4
|
||||
# via
|
||||
# nvidia-cudnn-cu12
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
nvidia-cuda-cupti-cu12==12.9.79
|
||||
# via torch
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
nvidia-cuda-nvrtc-cu12==12.9.86
|
||||
# via torch
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
nvidia-cuda-runtime-cu12==12.9.79
|
||||
# via torch
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
# via torch
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
nvidia-cufft-cu12==11.4.1.4
|
||||
# via torch
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
nvidia-cufile-cu12==1.14.1.1
|
||||
# via torch
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
nvidia-curand-cu12==10.3.10.19
|
||||
# via torch
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
nvidia-cusolver-cu12==11.7.5.82
|
||||
# via torch
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
nvidia-cusparse-cu12==12.5.10.65
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
# via torch
|
||||
nvidia-nccl-cu12==2.27.3
|
||||
nvidia-nccl-cu12==2.27.5
|
||||
# via torch
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
nvidia-nvjitlink-cu12==12.9.86
|
||||
# via
|
||||
# nvidia-cufft-cu12
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
# torch
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
nvidia-nvshmem-cu12==3.3.20
|
||||
# via torch
|
||||
nvidia-nvtx-cu12==12.9.79
|
||||
# via torch
|
||||
omegaconf==2.3.0
|
||||
# via
|
||||
@ -1017,7 +1019,6 @@ setuptools==77.0.3
|
||||
# lightning-utilities
|
||||
# pytablewriter
|
||||
# torch
|
||||
# triton
|
||||
shapely==2.1.1
|
||||
# via
|
||||
# geopandas
|
||||
@ -1122,7 +1123,7 @@ tomli==2.2.1
|
||||
# via schemathesis
|
||||
tomli-w==1.2.0
|
||||
# via schemathesis
|
||||
torch==2.8.0+cu128
|
||||
torch==2.9.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# accelerate
|
||||
@ -1151,7 +1152,7 @@ torch==2.8.0+cu128
|
||||
# torchvision
|
||||
# vector-quantize-pytorch
|
||||
# vocos
|
||||
torchaudio==2.8.0+cu128
|
||||
torchaudio==2.9.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# encodec
|
||||
@ -1164,7 +1165,7 @@ torchmetrics==1.7.4
|
||||
# pytorch-lightning
|
||||
# terratorch
|
||||
# torchgeo
|
||||
torchvision==0.23.0+cu128
|
||||
torchvision==0.24.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# lightly
|
||||
@ -1205,7 +1206,7 @@ transformers==4.56.2
|
||||
# transformers-stream-generator
|
||||
transformers-stream-generator==0.0.5
|
||||
# via -r requirements/test.in
|
||||
triton==3.4.0
|
||||
triton==3.5.0
|
||||
# via torch
|
||||
tritonclient==2.51.0
|
||||
# via
|
||||
|
||||
@ -157,11 +157,9 @@ def test_models_distributed(
|
||||
and distributed_executor_backend == "ray"
|
||||
and attention_backend == ""
|
||||
and test_suite == "L4"
|
||||
and enable_prompt_embeds
|
||||
): # noqa
|
||||
if enable_prompt_embeds:
|
||||
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
|
||||
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
|
||||
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
|
||||
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
|
||||
|
||||
if attention_backend:
|
||||
monkeypatch_context.setenv(
|
||||
|
||||
@ -355,13 +355,13 @@ def test_toy_llama(
|
||||
)
|
||||
|
||||
compile_config_no_compile = CompilationConfig(
|
||||
level=CompilationMode.NONE,
|
||||
mode=CompilationMode.NONE,
|
||||
cudagraph_mode=CUDAGraphMode.NONE,
|
||||
backend="eager",
|
||||
)
|
||||
|
||||
compile_config_no_split = CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
backend=backend,
|
||||
|
||||
@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
|
||||
def make_vllm_config() -> VllmConfig:
|
||||
return VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -25,7 +25,7 @@ from vllm.distributed.parallel_state import (
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import (
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
from contextlib import nullcontext
|
||||
|
||||
import pytest
|
||||
|
||||
@ -8,6 +9,8 @@ from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
||||
|
||||
|
||||
@ -168,7 +171,7 @@ def test_splitting_ops_dynamic():
|
||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
splitting_ops=["vllm::unified_attention"],
|
||||
)
|
||||
@ -180,7 +183,7 @@ def test_splitting_ops_dynamic():
|
||||
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||
custom_ops=["+quant_fp8"],
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
@ -195,7 +198,7 @@ def test_splitting_ops_dynamic():
|
||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||
custom_ops=["+quant_fp8"],
|
||||
@ -233,3 +236,73 @@ def test_resolve_operator_overload():
|
||||
assert len(resolved) == 2 # Only 2 valid ops
|
||||
assert resolved[0] is torch.ops.aten.mm.default
|
||||
assert resolved[1] is torch.ops.aten.addmm.default
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.support_static_graph_mode(),
|
||||
reason="Skip if not cudagraph mode supported",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
(
|
||||
"cudagraph_capture_sizes",
|
||||
"max_cudagraph_capture_size",
|
||||
"tp_size",
|
||||
"enable_sequence_parallelism",
|
||||
"max_num_batched_tokens",
|
||||
"use_cudagraph",
|
||||
"expected_max_size",
|
||||
),
|
||||
[
|
||||
(None, None, 1, False, 2048, True, 512),
|
||||
([1, 2, 4], 4, 1, False, 2048, True, 4),
|
||||
([1, 2, 4], 8, 1, False, 2048, True, RuntimeError),
|
||||
([1, 256], None, 1, False, 2048, True, 256),
|
||||
([], None, 1, False, 2048, False, 0),
|
||||
(None, 0, 1, False, 2048, False, 0),
|
||||
# truncated to nearest multiple of 8 or 16
|
||||
(None, 257, 1, False, 2048, True, 256),
|
||||
([1, 2, 4, 15], None, 1, False, 2048, True, 15), # max from list
|
||||
([1, 2, 4, 15], None, 2, True, 2048, True, 4), # filtered out 15 due to SP
|
||||
([1, 2, 4, 15], None, 1, False, 8, True, 4), # limited by the max_tokens
|
||||
# the list should contain at least 1 element when use cudagraph
|
||||
([], None, 1, False, 2048, True, RuntimeError),
|
||||
# the max capturing size should be >= 1 when use cudagraph
|
||||
(None, 0, 1, False, 2048, True, RuntimeError),
|
||||
],
|
||||
)
|
||||
def test_cudagraph_sizes_post_init(
|
||||
cudagraph_capture_sizes,
|
||||
max_cudagraph_capture_size,
|
||||
tp_size,
|
||||
enable_sequence_parallelism,
|
||||
max_num_batched_tokens,
|
||||
use_cudagraph,
|
||||
expected_max_size,
|
||||
):
|
||||
ctx = nullcontext()
|
||||
if isinstance(expected_max_size, Exception):
|
||||
ctx = pytest.raises(expected_max_size)
|
||||
|
||||
cudagraph_mode = CUDAGraphMode.PIECEWISE if use_cudagraph else CUDAGraphMode.NONE
|
||||
with ctx:
|
||||
compilation_config = CompilationConfig(
|
||||
cudagraph_capture_sizes=cudagraph_capture_sizes,
|
||||
max_cudagraph_capture_size=max_cudagraph_capture_size,
|
||||
pass_config={
|
||||
"enable_sequence_parallelism": enable_sequence_parallelism,
|
||||
"enable_fusion": True,
|
||||
"enable_noop": True,
|
||||
},
|
||||
cudagraph_mode=cudagraph_mode,
|
||||
)
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/opt-125m",
|
||||
tensor_parallel_size=tp_size,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
|
||||
assert (
|
||||
vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size
|
||||
)
|
||||
|
||||
@ -198,7 +198,7 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
|
||||
compilation_config = (
|
||||
compile_config
|
||||
if isinstance(compile_config, CompilationConfig)
|
||||
else CompilationConfig(level=compile_config)
|
||||
else CompilationConfig(mode=compile_config)
|
||||
)
|
||||
|
||||
prompts = [
|
||||
|
||||
@ -31,7 +31,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
GroupShape,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
from ..utils import has_module_attribute, multi_gpu_test
|
||||
from .backend import TestBackend
|
||||
|
||||
@ -132,6 +132,9 @@ def test_attn_quant(
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
# FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at
|
||||
# CUDAGraphMode.NONE here because it derives an attention backend that
|
||||
# does not support full cudagraphs
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
@ -151,7 +154,7 @@ def test_attn_quant(
|
||||
cudagraph_mode=mode,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
@ -236,7 +239,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
custom_ops=custom_ops_list,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(
|
||||
enable_attn_fusion=True,
|
||||
enable_noop=True,
|
||||
@ -273,7 +276,7 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
|
||||
compilation_config = (
|
||||
compile_config
|
||||
if isinstance(compile_config, CompilationConfig)
|
||||
else CompilationConfig(level=compile_config)
|
||||
else CompilationConfig(mode=compile_config)
|
||||
)
|
||||
|
||||
prompts = [
|
||||
|
||||
@ -29,7 +29,7 @@ from vllm.distributed.parallel_state import (
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from .backend import TestBackend
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import cast
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -16,7 +16,13 @@ from vllm.compilation.activation_quant_fusion import (
|
||||
from vllm.compilation.fusion import QUANT_OPS
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import CompilationConfig, PassConfig, VllmConfig
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
PassConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
GroupShape,
|
||||
@ -25,7 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
Fp8LinearOp,
|
||||
cutlass_fp8_supported,
|
maybe_create_device_identity,
)
from vllm.platforms import current_platform

@ -54,6 +60,8 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR,
)
self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()

def forward(self, x):
y = self.silu_and_mul(x)
@ -61,7 +69,14 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
return x2

def ops_in_model_before(self):
return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]]
return [
SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
(
QUANT_OPS[kFp8StaticTensorSym]
if self.enable_quant_fp8_custom_op
else torch.ops.aten.reciprocal
),
]

def ops_in_model_after(self):
return [FUSED_OPS[kFp8StaticTensorSym]]
@ -77,6 +92,7 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
assert silu_and_mul_nvfp4_quant_supported

self.silu_and_mul = SiluAndMul()
self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()

# create nvfp4 weight
w = torch.rand((hidden_size, hidden_size))
@ -101,7 +117,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
return out

def ops_in_model_before(self):
return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]]
return [
SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
QUANT_OPS[kNvfp4Quant],
]

def ops_in_model_after(self):
return [FUSED_OPS[kNvfp4Quant]]
@ -110,67 +129,80 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [32, 64])
@pytest.mark.parametrize("hidden_size", [128, 256])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("enable_silu_mul_custom_op", [True, False])
@pytest.mark.parametrize(
"model_class",
cast(
list[type],
[TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
if is_nvfp4_supported()
else [TestSiluMulFp8QuantModel],
),
"model_class, enable_quant_fp8_custom_op, cuda_force_torch",
list(itertools.product([TestSiluMulFp8QuantModel], [True, False], [True, False]))
+ [(TestSiluMulNvfp4QuantModel, False, False)],
)
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@pytest.mark.parametrize(
"cuda_force_torch", [True, False] if cutlass_fp8_supported() else [True]
)
@pytest.mark.skipif(
envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
)
def test_fusion_silu_and_mul_quant(
num_tokens, hidden_size, dtype, model_class, cuda_force_torch
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
model_class: type[TestSiluMulFp8QuantModel | TestSiluMulNvfp4QuantModel],
enable_silu_mul_custom_op: bool,
enable_quant_fp8_custom_op: bool,
cuda_force_torch: bool,
):
if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
pytest.skip("Duplicate tests for NVFP4")
if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
pytest.skip("NVFP4 is not supported on this GPU.")

torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
maybe_create_device_identity()

x = torch.rand(num_tokens, hidden_size * 2)

# Reshape pass is needed for the fusion pass to work
config = VllmConfig()
config.compilation_config = CompilationConfig(
pass_config=PassConfig(enable_fusion=True, enable_noop=True)
)
fusion_pass = ActivationQuantFusionPass(config)

passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)]
backend = TestBackend(*passes)
model = model_class(hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x)

# First dimension dynamic
torch._dynamo.mark_dynamic(x, 0)

result = model(x)

model2 = torch.compile(model, backend=backend)
result2 = model2(x)

# Check that it gives the same answer
if model_class == TestSiluMulFp8QuantModel:
atol, rtol = 1e-3, 1e-3
elif model_class == TestSiluMulNvfp4QuantModel:
atol, rtol = 1e-1, 1e-1

torch.testing.assert_close(
result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol
custom_ops = []
if enable_silu_mul_custom_op:
custom_ops.append("+silu_and_mul")
if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8")
config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops,
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
),
)

assert fusion_pass.matched_count == 1
with set_current_vllm_config(config):
fusion_pass = ActivationQuantFusionPass(config)

# In pre-nodes, quant op should be present and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())
passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)]
backend = TestBackend(*passes)
model = model_class(
hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x
)

# In post-nodes, fused kernels should be present and quant op should not
backend.check_after_ops(model.ops_in_model_after())
# First dimension dynamic
torch._dynamo.mark_dynamic(x, 0)

result = model(x)

model2 = torch.compile(model, backend=backend)
result2 = model2(x)

# Check that it gives the same answer
if model_class == TestSiluMulFp8QuantModel:
atol, rtol = 1e-3, 1e-3
elif model_class == TestSiluMulNvfp4QuantModel:
atol, rtol = 1e-1, 1e-1

torch.testing.assert_close(
result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol
)

assert fusion_pass.matched_count == 1

# In pre-nodes, quant op should be present and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())

# In post-nodes, fused kernels should be present and quant op should not
backend.check_after_ops(model.ops_in_model_after())
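Note on the new parametrization above: the explicit itertools.product list replaces the earlier nested cast(...) parametrize, so the quant_fp8 custom-op toggle and cuda_force_torch only vary for the FP8 model. A minimal sketch of how that list expands (class names shown as plain strings purely for illustration):

import itertools

# FP8 model: every combination of the quant_fp8 custom-op toggle and
# cuda_force_torch; NVFP4 model: a single case with both disabled.
fp8_cases = list(itertools.product(["TestSiluMulFp8QuantModel"], [True, False], [True, False]))
nvfp4_cases = [("TestSiluMulNvfp4QuantModel", False, False)]
# 5 (model_class, enable_quant_fp8_custom_op, cuda_force_torch) tuples in total.
print(fp8_cases + nvfp4_cases)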
tests/config/test_multimodal_config.py (new file, 25 lines)
@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.attention.backends.registry import _Backend
from vllm.config.multimodal import MultiModalConfig


def test_mm_encoder_attn_backend_str_conversion():
config = MultiModalConfig(mm_encoder_attn_backend="FLASH_ATTN")
assert config.mm_encoder_attn_backend == _Backend.FLASH_ATTN


def test_mm_encoder_attn_backend_invalid():
with pytest.raises(ValueError):
MultiModalConfig(mm_encoder_attn_backend="not_a_backend")


def test_mm_encoder_attn_backend_hash_updates():
base_hash = MultiModalConfig().compute_hash()
overridden_hash = MultiModalConfig(
mm_encoder_attn_backend=_Backend.FLASH_ATTN
).compute_hash()
assert base_hash != overridden_hash
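For readers of the new test file above: the field accepts either the backend name as a string or a _Backend enum member, and the override participates in compute_hash(). A minimal usage sketch, relying only on the API exercised by these tests:

from vllm.attention.backends.registry import _Backend
from vllm.config.multimodal import MultiModalConfig

# The string spelling normalizes to the enum member.
cfg = MultiModalConfig(mm_encoder_attn_backend="FLASH_ATTN")
assert cfg.mm_encoder_attn_backend == _Backend.FLASH_ATTN

# The override changes the config hash, so anything keyed on it is invalidated.
assert cfg.compute_hash() != MultiModalConfig().compute_hash()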
@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.collections import is_list_of
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)

@ -15,7 +15,7 @@ from vllm.distributed.parallel_state import (
get_tp_group,
init_distributed_environment,
)
from vllm.utils import update_environment_variables
from vllm.utils.system_utils import update_environment_variables


def distributed_run(fn, world_size):

@ -263,3 +263,52 @@ def test_data_parallel_rank_tagging(publisher_config):
|
||||
pub_1.shutdown()
|
||||
sub_0.close()
|
||||
sub_1.close()
|
||||
|
||||
|
||||
def test_event_publisher_factory():
|
||||
"""Test event publisher factory creation behavior under different configurations"""
|
||||
from vllm.config.kv_events import KVEventsConfig
|
||||
from vllm.distributed.kv_events import ZmqEventPublisher
|
||||
|
||||
# test config is None
|
||||
publisher = EventPublisherFactory.create(None, DP_RANK)
|
||||
assert isinstance(publisher, NullEventPublisher)
|
||||
publisher.shutdown()
|
||||
|
||||
# test disable kv cache events
|
||||
config = KVEventsConfig(
|
||||
enable_kv_cache_events=False,
|
||||
publisher="zmq", # Even if zmq is specified, should return NullEventPublisher
|
||||
endpoint="tcp://localhost:5557",
|
||||
)
|
||||
publisher = EventPublisherFactory.create(config, DP_RANK)
|
||||
assert isinstance(publisher, NullEventPublisher)
|
||||
publisher.shutdown()
|
||||
|
||||
# test zmq publisher
|
||||
config = KVEventsConfig(
|
||||
enable_kv_cache_events=True,
|
||||
publisher="zmq",
|
||||
endpoint="inproc://test-factory-true",
|
||||
)
|
||||
publisher = EventPublisherFactory.create(config, DP_RANK)
|
||||
assert isinstance(publisher, ZmqEventPublisher)
|
||||
publisher.shutdown()
|
||||
|
||||
# test unknown publisher
|
||||
with pytest.raises(ValueError, match="Input should be"):
|
||||
KVEventsConfig(
|
||||
enable_kv_cache_events=True,
|
||||
publisher="unknown_publisher",
|
||||
endpoint="tcp://localhost:5557",
|
||||
)
|
||||
|
||||
# test publisher not specified
|
||||
config = KVEventsConfig(
|
||||
enable_kv_cache_events=True,
|
||||
# publisher not specified, should default to "zmq"
|
||||
endpoint="tcp://localhost:5557",
|
||||
)
|
||||
publisher = EventPublisherFactory.create(config, DP_RANK)
|
||||
assert isinstance(publisher, ZmqEventPublisher)
|
||||
publisher.shutdown()
|
||||
|
||||
@ -18,8 +18,8 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

from vllm import initialize_ray_cluster
from vllm.config import ParallelConfig
from vllm.executor.ray_utils import _wait_until_pg_removed
from vllm.utils import get_ip
from vllm.utils.network_utils import get_ip
from vllm.v1.executor.ray_utils import _wait_until_pg_removed

VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

@ -23,7 +23,7 @@ from vllm.distributed.parallel_state import (
initialize_model_parallel,
)
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
from vllm.utils.system_utils import update_environment_variables

torch.manual_seed(42)
random.seed(44)

@ -7,7 +7,7 @@ import torch.distributed as dist

from vllm.distributed.parallel_state import _node_count
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import get_ip, get_open_port
from vllm.utils.network_utils import get_ip, get_open_port

if __name__ == "__main__":
dist.init_process_group(backend="gloo")

@ -244,7 +244,7 @@ def _compare_tp(
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
hf_config = get_config(model_id, trust_remote_code)
|
||||
skip_tokenizer_init = model_info.skip_tokenizer_init
|
||||
require_embed_inputs = model_info.require_embed_inputs
|
||||
max_num_seqs = model_info.max_num_seqs
|
||||
|
||||
dtype = "float16"
|
||||
@ -299,16 +299,20 @@ def _compare_tp(
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
if skip_tokenizer_init:
|
||||
common_args.append("--skip-tokenizer-init")
|
||||
if require_embed_inputs:
|
||||
common_args.extend(
|
||||
[
|
||||
"--skip-tokenizer-init",
|
||||
"--enable-prompt-embeds",
|
||||
"--enable-mm-embeds",
|
||||
]
|
||||
)
|
||||
if max_num_seqs:
|
||||
common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
|
||||
|
||||
if distributed_backend == "ray":
|
||||
# For V1, test Ray Compiled Graph for all the tests
|
||||
# Test Ray Compiled Graph for all the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
|
||||
@ -18,7 +18,7 @@ from vllm.distributed.parallel_state import (
|
||||
graph_capture,
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils import update_environment_variables
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
|
||||
@ -3,11 +3,24 @@
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.parallel_state import in_the_same_node_as
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils import get_ip, get_open_port
|
||||
from vllm.utils.network_utils import get_ip, get_open_port
|
||||
|
||||
|
||||
def _run_test(pg):
|
||||
test_result = all(in_the_same_node_as(pg, source_rank=0))
|
||||
|
||||
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
|
||||
assert test_result == expected, f"Expected {expected}, got {test_result}"
|
||||
if pg == dist.group.WORLD:
|
||||
print("Same node test passed! when using torch distributed!")
|
||||
else:
|
||||
print("Same node test passed! when using StatelessProcessGroup!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dist.init_process_group(backend="gloo")
|
||||
@ -25,11 +38,12 @@ if __name__ == "__main__":
|
||||
stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
|
||||
|
||||
for pg in [dist.group.WORLD, stateless_pg]:
|
||||
test_result = all(in_the_same_node_as(pg, source_rank=0))
|
||||
|
||||
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
|
||||
assert test_result == expected, f"Expected {expected}, got {test_result}"
|
||||
if pg == dist.group.WORLD:
|
||||
print("Same node test passed! when using torch distributed!")
|
||||
if os.environ.get("VLLM_TEST_WITH_DEFAULT_DEVICE_SET", "0") == "1":
|
||||
default_devices = ["cpu"]
|
||||
if torch.cuda.is_available():
|
||||
default_devices.append("cuda")
|
||||
for device in default_devices:
|
||||
torch.set_default_device(device)
|
||||
_run_test(pg)
|
||||
else:
|
||||
print("Same node test passed! when using StatelessProcessGroup!")
|
||||
_run_test(pg)
|
||||
|
||||
@ -181,7 +181,7 @@ def _compare_sp(
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
skip_tokenizer_init = model_info.skip_tokenizer_init
|
||||
require_embed_inputs = model_info.require_embed_inputs
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
@ -233,8 +233,14 @@ def _compare_sp(
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
if skip_tokenizer_init:
|
||||
common_args.append("--skip-tokenizer-init")
|
||||
if require_embed_inputs:
|
||||
common_args.extend(
|
||||
[
|
||||
"--skip-tokenizer-init",
|
||||
"--enable-prompt-embeds",
|
||||
"--enable-mm-embeds",
|
||||
]
|
||||
)
|
||||
|
||||
compilation_config = {
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
|
||||
@ -10,7 +10,8 @@ import torch.distributed as dist

from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import get_open_port, update_environment_variables
from vllm.utils.network_utils import get_open_port
from vllm.utils.system_utils import update_environment_variables


def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:

@ -23,7 +23,7 @@ from vllm.distributed.parallel_state import (
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
from vllm.utils.system_utils import update_environment_variables

torch.manual_seed(42)
random.seed(44)

@ -10,10 +10,8 @@ import torch
import vllm.envs as envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import (
get_open_port,
update_environment_variables,
)
from vllm.utils.network_utils import get_open_port
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import cuda_device_count_stateless

from ..utils import multi_gpu_test

tests/entrypoints/anthropic/test_messages.py (new file, 141 lines)
@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import anthropic
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteAnthropicServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
args = [
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--enforce-eager",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
"--served-model-name",
|
||||
"claude-3-7-sonnet-latest",
|
||||
]
|
||||
|
||||
with RemoteAnthropicServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_messages(client: anthropic.AsyncAnthropic):
|
||||
resp = await client.messages.create(
|
||||
model="claude-3-7-sonnet-latest",
|
||||
max_tokens=1024,
|
||||
messages=[{"role": "user", "content": "how are you!"}],
|
||||
)
|
||||
assert resp.stop_reason == "end_turn"
|
||||
assert resp.role == "assistant"
|
||||
|
||||
print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_system_message(client: anthropic.AsyncAnthropic):
|
||||
resp = await client.messages.create(
|
||||
model="claude-3-7-sonnet-latest",
|
||||
max_tokens=1024,
|
||||
system="you are a helpful assistant",
|
||||
messages=[{"role": "user", "content": "how are you!"}],
|
||||
)
|
||||
assert resp.stop_reason == "end_turn"
|
||||
assert resp.role == "assistant"
|
||||
|
||||
print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
|
||||
resp = await client.messages.create(
|
||||
model="claude-3-7-sonnet-latest",
|
||||
max_tokens=1024,
|
||||
messages=[{"role": "user", "content": "how are you!"}],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
async for chunk in resp:
|
||||
print(chunk.model_dump_json())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
|
||||
resp = await client.messages.create(
|
||||
model="claude-3-7-sonnet-latest",
|
||||
max_tokens=1024,
|
||||
messages=[
|
||||
{"role": "user", "content": "What's the weather like in New York today?"}
|
||||
],
|
||||
tools=[
|
||||
{
|
||||
"name": "get_current_weather",
|
||||
"description": "Useful for querying the weather in a specified city.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "City or region, for example: "
|
||||
"New York, London, Tokyo, etc.",
|
||||
}
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
}
|
||||
],
|
||||
stream=False,
|
||||
)
|
||||
assert resp.stop_reason == "tool_use"
|
||||
assert resp.role == "assistant"
|
||||
|
||||
print(f"Anthropic response: {resp.model_dump_json()}")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
|
||||
resp = await client.messages.create(
|
||||
model="claude-3-7-sonnet-latest",
|
||||
max_tokens=1024,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in New York today?",
|
||||
}
|
||||
],
|
||||
tools=[
|
||||
{
|
||||
"name": "get_current_weather",
|
||||
"description": "Useful for querying the weather "
|
||||
"in a specified city.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "City or region, for example: "
|
||||
"New York, London, Tokyo, etc.",
|
||||
}
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
async for chunk in resp:
|
||||
print(chunk.model_dump_json())
|
||||
@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm import LLM

@ -12,8 +13,22 @@ def test_empty_prompt():
llm.generate([""])


@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match="out of vocabulary"):
llm.generate({"prompt_token_ids": [999999]})


def test_require_mm_embeds():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
enforce_eager=True,
enable_mm_embeds=False,
)
with pytest.raises(ValueError, match="--enable-mm-embeds"):
llm.generate(
{
"prompt": "<image>",
"multi_modal_data": {"image": torch.empty(1, 1, 1)},
}
)

@ -599,145 +599,6 @@ async def test_structured_outputs_choice_chat_logprobs(
|
||||
assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_named_tool_use(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
):
|
||||
messages = [
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Give an example JSON for an employee profile using the specified tool."
|
||||
),
|
||||
},
|
||||
]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
]
|
||||
tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}
|
||||
|
||||
# non-streaming
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
json_string = message.tool_calls[0].function.arguments
|
||||
json1 = json.loads(json_string)
|
||||
jsonschema.validate(instance=json1, schema=sample_json_schema)
|
||||
|
||||
messages.append({"role": "assistant", "content": json_string})
|
||||
messages.append(
|
||||
{"role": "user", "content": "Give me another one with a different name and age"}
|
||||
)
|
||||
|
||||
# streaming
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
output = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
assert delta.content is None or len(delta.content) == 0
|
||||
if delta.tool_calls:
|
||||
output.append(delta.tool_calls[0].function.arguments)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
json2 = json.loads("".join(output))
|
||||
jsonschema.validate(instance=json2, schema=sample_json_schema)
|
||||
assert json1["name"] != json2["name"]
|
||||
assert json1["age"] != json2["age"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_inconsistent_tool_choice_and_tools(
|
||||
client: openai.AsyncOpenAI, sample_json_schema
|
||||
):
|
||||
messages = [
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}",
|
||||
},
|
||||
]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {"name": "dummy_function_name"},
|
||||
},
|
||||
)
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {"name": "nondefined_function_name"},
|
||||
},
|
||||
)
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
],
|
||||
tool_choice={},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_response_format_json_object(client: openai.AsyncOpenAI):
|
||||
for _ in range(2):
|
||||
|
||||
@ -114,7 +114,9 @@ def test_get_gen_prompt(
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
revision=model_info.revision,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
@ -194,11 +196,19 @@ async def test_function_tool_use(
|
||||
)
|
||||
|
||||
output = []
|
||||
reasoning = []
|
||||
async for chunk in output_stream:
|
||||
if chunk.choices and chunk.choices[0].delta.tool_calls:
|
||||
output.extend(chunk.choices[0].delta.tool_calls)
|
||||
if chunk.choices:
|
||||
if enable_thinking and getattr(
|
||||
chunk.choices[0].delta, "reasoning_content", None
|
||||
):
|
||||
reasoning.append(chunk.choices[0].delta.reasoning_content)
|
||||
if chunk.choices[0].delta.tool_calls:
|
||||
output.extend(chunk.choices[0].delta.tool_calls)
|
||||
|
||||
assert len(output) > 0
|
||||
if enable_thinking:
|
||||
assert len(reasoning) > 0
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@ -333,3 +343,144 @@ async def test_no_args_tool_call(
|
||||
else:
|
||||
# No tool called — just print model's direct reply
|
||||
assert message.content is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_named_tool_use(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
):
|
||||
messages = [
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Give an example JSON for an employee profile using the specified tool."
|
||||
),
|
||||
},
|
||||
]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
]
|
||||
tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}
|
||||
|
||||
# non-streaming
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
json_string = message.tool_calls[0].function.arguments
|
||||
json1 = json.loads(json_string)
|
||||
jsonschema.validate(instance=json1, schema=sample_json_schema)
|
||||
|
||||
messages.append({"role": "assistant", "content": json_string})
|
||||
messages.append(
|
||||
{"role": "user", "content": "Give me another one with a different name and age"}
|
||||
)
|
||||
|
||||
# streaming
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
output = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
assert delta.content is None or len(delta.content) == 0
|
||||
if delta.tool_calls:
|
||||
output.append(delta.tool_calls[0].function.arguments)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
json2 = json.loads("".join(output))
|
||||
jsonschema.validate(instance=json2, schema=sample_json_schema)
|
||||
assert json1["name"] != json2["name"]
|
||||
assert json1["age"] != json2["age"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_inconsistent_tool_choice_and_tools(
|
||||
client: openai.AsyncOpenAI, sample_json_schema
|
||||
):
|
||||
messages = [
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}",
|
||||
},
|
||||
]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {"name": "dummy_function_name"},
|
||||
},
|
||||
)
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {"name": "nondefined_function_name"},
|
||||
},
|
||||
)
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema,
|
||||
},
|
||||
}
|
||||
],
|
||||
tool_choice={},
|
||||
)
|
||||
|
||||
@ -292,3 +292,16 @@ async def test_prompt_logprobs_raises_error(
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
) -> None:
|
||||
await client_with_prompt_embeds.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Hello",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": []},
|
||||
)
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
from unittest.mock import Mock
|
||||
|
||||
# imports for structured outputs tests
|
||||
import openai
|
||||
@ -10,7 +11,8 @@ import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.renderer import BaseRenderer
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.renderer import CompletionRenderer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -59,6 +61,10 @@ async def test_out_of_vocab_token_ids():
|
||||
def test_load_prompt_embeds(
|
||||
dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
|
||||
):
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.enable_prompt_embeds = True
|
||||
renderer = CompletionRenderer(model_config, tokenizer=None)
|
||||
|
||||
# construct arbitrary tensors of various dtypes, layouts, and sizes.
|
||||
# We need to check against different layouts to make sure that if a user
|
||||
# uses sparse tensors to reduce the transmission size of prompt embeddings,
|
||||
@ -83,7 +89,7 @@ def test_load_prompt_embeds(
|
||||
buffer.seek(0)
|
||||
encoded_tensor = pybase64.b64encode(buffer.getvalue())
|
||||
|
||||
loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
|
||||
loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
|
||||
assert len(loaded_prompt_embeds) == 1
|
||||
loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
|
||||
assert loaded_tensor.device.type == "cpu"
|
||||
@ -91,3 +97,22 @@ def test_load_prompt_embeds(
|
||||
torch.testing.assert_close(
|
||||
loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float32])
|
||||
@pytest.mark.parametrize("seq_len", [2])
|
||||
@pytest.mark.parametrize("hidden_size", [2])
|
||||
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.enable_prompt_embeds = False
|
||||
renderer = CompletionRenderer(model_config, tokenizer=None)
|
||||
|
||||
tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
torch.save(tensor, buffer)
|
||||
buffer.seek(0)
|
||||
encoded_tensor = pybase64.b64encode(buffer.getvalue())
|
||||
|
||||
with pytest.raises(ValueError, match="--enable-prompt-embeds"):
|
||||
renderer.load_prompt_embeds(encoded_tensor)
|
||||
|
||||
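Context for the renderer tests above: a prompt-embeds payload is just a torch.save of the tensor into an in-memory buffer, base64-encoded. A minimal client-side sketch of producing the "prompt_embeds" field (the tensor shape and the use of the standard base64 module instead of pybase64 are illustrative):

import base64
import io

import torch


def encode_prompt_embeds(embeds: torch.Tensor) -> str:
    # Serialize the embedding tensor the same way the tests above do,
    # then base64-encode the bytes for the "prompt_embeds" request field.
    buffer = io.BytesIO()
    torch.save(embeds, buffer)
    buffer.seek(0)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


encoded = encode_prompt_embeds(torch.randn(4, 4096, dtype=torch.float32))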
tests/entrypoints/openai/test_responses_function_call_parsing.py (new file, 330 lines)
@ -0,0 +1,330 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test function call parsing in ResponsesRequest."""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from openai.types.responses import ResponseFunctionToolCall
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ResponsesRequest
|
||||
|
||||
|
||||
def test_function_call_dict_converted_to_object():
|
||||
"""Test that function_call dictionaries are correctly parsed into
|
||||
ResponseFunctionToolCall objects."""
|
||||
# Create a request with function_call as dict
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "fc_123",
|
||||
"name": "get_weather",
|
||||
"arguments": '{"location": "Boston", "unit": "celsius"}',
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify the input item is now a ResponseFunctionToolCall object
|
||||
assert len(request.input) == 1
|
||||
assert isinstance(request.input[0], ResponseFunctionToolCall)
|
||||
assert request.input[0].call_id == "fc_123"
|
||||
assert request.input[0].name == "get_weather"
|
||||
assert request.input[0].arguments == '{"location": "Boston", "unit": "celsius"}'
|
||||
|
||||
|
||||
def test_direct_function_call_object_preservation():
|
||||
"""Test that ResponseFunctionToolCall objects passed directly are preserved."""
|
||||
# Create a request with ResponseFunctionToolCall object
|
||||
function_call = ResponseFunctionToolCall(
|
||||
type="function_call",
|
||||
call_id="fc_456",
|
||||
name="get_stock_price",
|
||||
arguments='{"symbol": "AAPL"}',
|
||||
)
|
||||
|
||||
request_data = {"model": "gpt-oss", "input": [function_call]}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify the object is preserved
|
||||
assert len(request.input) == 1
|
||||
assert request.input[0] is function_call
|
||||
|
||||
|
||||
def test_mixed_input_types_with_function_calls():
|
||||
"""Test parsing with mixed input types including function calls."""
|
||||
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
# Valid Message type
|
||||
{
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": "What's the weather?"}],
|
||||
},
|
||||
# Function call that should be parsed
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "fc_789",
|
||||
"name": "check_weather",
|
||||
"arguments": '{"location": "NYC"}',
|
||||
},
|
||||
# Another function call
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "fc_790",
|
||||
"name": "get_time",
|
||||
"arguments": "{}",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify mixed types are handled correctly
|
||||
assert len(request.input) == 3
|
||||
# First item should be validated as Message
|
||||
assert request.input[0]["type"] == "message"
|
||||
# Second item should be parsed to ResponseFunctionToolCall
|
||||
assert isinstance(request.input[1], ResponseFunctionToolCall)
|
||||
assert request.input[1].call_id == "fc_789"
|
||||
assert request.input[1].name == "check_weather"
|
||||
# Third item should also be parsed to ResponseFunctionToolCall
|
||||
assert isinstance(request.input[2], ResponseFunctionToolCall)
|
||||
assert request.input[2].call_id == "fc_790"
|
||||
assert request.input[2].name == "get_time"
|
||||
|
||||
|
||||
def test_function_call_with_complex_arguments():
|
||||
"""Test parsing function calls with complex nested arguments."""
|
||||
complex_args = {
|
||||
"query": "weather forecast",
|
||||
"filters": {
|
||||
"location": {"city": "San Francisco", "state": "CA"},
|
||||
"timeRange": {"start": "2024-01-01", "end": "2024-01-07"},
|
||||
"metrics": ["temperature", "humidity", "precipitation"],
|
||||
},
|
||||
"options": {"format": "detailed", "includeAlerts": True},
|
||||
}
|
||||
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "fc_complex",
|
||||
"name": "advanced_weather_query",
|
||||
"arguments": json.dumps(complex_args),
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify complex arguments are preserved correctly
|
||||
assert len(request.input) == 1
|
||||
assert isinstance(request.input[0], ResponseFunctionToolCall)
|
||||
assert request.input[0].call_id == "fc_complex"
|
||||
assert request.input[0].name == "advanced_weather_query"
|
||||
|
||||
# Parse the arguments back to verify they're intact
|
||||
parsed_args = json.loads(request.input[0].arguments)
|
||||
assert parsed_args == complex_args
|
||||
|
||||
|
||||
def test_invalid_function_call_fallback():
|
||||
"""Test that invalid function call dictionaries fall back gracefully."""
|
||||
# Missing required field 'call_id'
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
{"type": "function_call", "name": "incomplete_function", "arguments": "{}"}
|
||||
],
|
||||
}
|
||||
|
||||
# This should not raise an error during model creation
|
||||
# The validator should keep the original dict and let Pydantic
|
||||
# handle validation
|
||||
with pytest.raises(ValueError):
|
||||
# Pydantic should raise a validation error for the invalid structure
|
||||
ResponsesRequest(**request_data)
|
||||
|
||||
|
||||
def test_string_input_not_affected():
|
||||
"""Test that string input is not affected by the validator."""
|
||||
request_data = {"model": "gpt-oss", "input": "This is a simple string input"}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify string input remains unchanged
|
||||
assert request.input == "This is a simple string input"
|
||||
|
||||
|
||||
def test_empty_list_input():
|
||||
"""Test that empty list input is handled correctly."""
|
||||
request_data = {"model": "gpt-oss", "input": []}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# Verify empty list is preserved
|
||||
assert request.input == []
|
||||
|
||||
|
||||
def test_function_call_output_not_affected():
|
||||
"""Test that FunctionCallOutput is not affected by the function_call parsing."""
|
||||
|
||||
# Test with FunctionCallOutput as dict (should not be parsed)
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
{
|
||||
"type": "function_call_output",
|
||||
"call_id": "fc_output_123",
|
||||
"output": "The weather in Boston is 72°F and sunny.",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
# FunctionCallOutput should remain as dict (not converted to an object)
|
||||
assert len(request.input) == 1
|
||||
assert isinstance(request.input[0], dict)
|
||||
assert request.input[0]["type"] == "function_call_output"
|
||||
assert request.input[0]["call_id"] == "fc_output_123"
|
||||
assert request.input[0]["output"] == "The weather in Boston is 72°F and sunny."
|
||||
|
||||
|
||||
def test_mixed_function_call_and_output():
|
||||
"""Test that function_call is parsed while function_call_output is preserved."""
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
# This should be parsed to ResponseFunctionToolCall
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "fc_call_456",
|
||||
"name": "get_weather",
|
||||
"arguments": '{"location": "NYC"}',
|
||||
},
|
||||
# This should remain as dict
|
||||
{
|
||||
"type": "function_call_output",
|
||||
"call_id": "fc_call_456",
|
||||
"output": "NYC weather is 68°F with light rain",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**request_data)
|
||||
|
||||
assert len(request.input) == 2
|
||||
|
||||
# First item should be parsed to ResponseFunctionToolCall
|
||||
assert isinstance(request.input[0], ResponseFunctionToolCall)
|
||||
assert request.input[0].call_id == "fc_call_456"
|
||||
assert request.input[0].name == "get_weather"
|
||||
|
||||
# Second item should remain as dict (FunctionCallOutput)
|
||||
assert isinstance(request.input[1], dict)
|
||||
assert request.input[1]["type"] == "function_call_output"
|
||||
assert request.input[1]["call_id"] == "fc_call_456"
|
||||
assert request.input[1]["output"] == "NYC weather is 68°F with light rain"
|
||||
|
||||
|
||||
def test_function_call_validation_failure_logs_debug(caplog):
|
||||
"""Test that validation failures are logged at debug level."""
|
||||
from unittest.mock import patch
|
||||
|
||||
request_data = {
|
||||
"model": "gpt-oss",
|
||||
"input": [
|
||||
{
|
||||
"type": "function_call",
|
||||
"name": "incomplete_function",
|
||||
"arguments": "{}", # Missing call_id
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Mock the logger to verify debug was called
|
||||
with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
|
||||
with pytest.raises(ValueError):
|
||||
ResponsesRequest(**request_data)
|
||||
|
||||
# Verify debug was called with expected message
|
||||
mock_logger.debug.assert_called_once()
|
||||
call_args = mock_logger.debug.call_args[0][0]
|
||||
assert "Failed to parse function_call" in call_args
|
||||
|
||||
|
||||
def test_validator_handles_iterator_input():
|
||||
"""Test that validator can handle ValidatorIterator input (Pydantic internal)."""
|
||||
|
||||
# This test simulates when Pydantic passes a ValidatorIterator instead of a list
|
||||
# This happened with complex nested structures containing reasoning + function_call
|
||||
|
||||
# Create test data that would normally be a list
|
||||
test_input_items = [
|
||||
{
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": "Test"}],
|
||||
},
|
||||
{
|
||||
"type": "reasoning",
|
||||
"id": "rs_1",
|
||||
"summary": [{"type": "summary_text", "text": "Test reasoning"}],
|
||||
"content": [{"type": "reasoning_text", "text": "Test content"}],
|
||||
},
|
||||
{
|
||||
"type": "function_call",
|
||||
"call_id": "call_1",
|
||||
"name": "test_function",
|
||||
"arguments": '{"test": "value"}',
|
||||
"id": "fc_1",
|
||||
},
|
||||
]
|
||||
|
||||
# Mock data where input is an iterator (simulates Pydantic ValidatorIterator)
|
||||
mock_data = {
|
||||
"model": "test-model",
|
||||
"input": iter(test_input_items), # Iterator instead of list
|
||||
}
|
||||
|
||||
# This should NOT raise an error with the fixed validator
|
||||
try:
|
||||
request = ResponsesRequest(**mock_data)
|
||||
|
||||
# Verify the validator processed the data correctly
|
||||
assert len(request.input) == 3
|
||||
|
||||
# Verify function_call was converted to ResponseFunctionToolCall object
|
||||
function_call_item = None
|
||||
for item in request.input:
|
||||
if isinstance(item, ResponseFunctionToolCall):
|
||||
function_call_item = item
|
||||
break
|
||||
|
||||
assert function_call_item is not None
|
||||
assert function_call_item.call_id == "call_1"
|
||||
assert function_call_item.name == "test_function"
|
||||
|
||||
except Exception as e:
|
||||
pytest.fail(f"Validator should handle iterator input, but failed with: {e}")
|
||||
|
||||
|
||||
def test_validator_handles_empty_iterator():
|
||||
"""Test validator handles empty iterator gracefully."""
|
||||
mock_data = {
|
||||
"model": "test-model",
|
||||
"input": iter([]), # Empty iterator
|
||||
}
|
||||
|
||||
request = ResponsesRequest(**mock_data)
|
||||
assert request.input == []
|
||||
Some files were not shown because too many files have changed in this diff.