[Frontend] Add vllm bench sweep to CLI (#27639)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -5,4 +5,4 @@ nav:
   - complete.md
   - run-batch.md
   - vllm bench:
-      - bench/*.md
+      - bench/**/*.md

docs/cli/bench/sweep/plot.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench sweep plot
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_plot.md"

docs/cli/bench/sweep/serve.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench sweep serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_serve.md"

docs/cli/bench/sweep/serve_sla.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_sla
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_serve_sla.md"

@@ -1061,7 +1061,7 @@ Follow these steps to run the script:
 Example command:
 
 ```bash
-python -m vllm.benchmarks.sweep.serve \
+vllm bench sweep serve \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
     --serve-params benchmarks/serve_hparams.json \

@@ -1109,7 +1109,7 @@ For example, to ensure E2E latency within different target values for 99% of requests:
 Example command:
 
 ```bash
-python -m vllm.benchmarks.sweep.serve_sla \
+vllm bench sweep serve_sla \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
     --serve-params benchmarks/serve_hparams.json \

@@ -1138,7 +1138,7 @@ The algorithm for adjusting the SLA variable is as follows:
 Example command:
 
 ```bash
-python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
+vllm bench sweep plot benchmarks/results/<timestamp> \
     --var-x max_concurrency \
     --row-by random_input_len \
     --col-by random_output_len \

@@ -56,15 +56,20 @@ def auto_mock(module, attr, max_mocks=50):
 )
 
 
-latency = auto_mock("vllm.benchmarks", "latency")
-serve = auto_mock("vllm.benchmarks", "serve")
-throughput = auto_mock("vllm.benchmarks", "throughput")
+bench_latency = auto_mock("vllm.benchmarks", "latency")
+bench_serve = auto_mock("vllm.benchmarks", "serve")
+bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
+bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
+bench_sweep_serve_sla = auto_mock(
+    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+)
+bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
 EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
 ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
 CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
-cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
-run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
+openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
+openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
 FlexibleArgumentParser = auto_mock(
     "vllm.utils.argparse_utils", "FlexibleArgumentParser"
 )
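
The `auto_mock(module, attr, max_mocks=50)` helper named in the hunk header imports a module attribute for documentation purposes while stubbing out dependencies that are not installed at docs-build time. Its implementation is not part of this diff; the sketch below only illustrates that general pattern, and the retry loop and `MagicMock` fallback are assumptions rather than the hook's actual code.

```python
# Illustrative sketch only -- not the real generate_argparse.py helper.
import sys
from importlib import import_module
from unittest.mock import MagicMock


def auto_mock(module: str, attr: str, max_mocks: int = 50):
    """Import `attr` from `module`, mocking missing dependencies as needed."""
    for _ in range(max_mocks):
        try:
            parent = import_module(module)
            # `attr` may be a plain attribute or a submodule of `module`.
            return getattr(parent, attr, None) or import_module(f"{module}.{attr}")
        except ModuleNotFoundError as e:
            # Stub the missing third-party package and retry the import.
            sys.modules[e.name] = MagicMock()
    raise ImportError(f"Could not import {attr} from {module} after {max_mocks} mocks")
```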

@@ -114,6 +119,9 @@ class MarkdownFormatter(HelpFormatter):
             self._markdown_output.append(f"{action.help}\n\n")
 
             if (default := action.default) != SUPPRESS:
+                # Make empty string defaults visible
+                if default == "":
+                    default = '""'
                 self._markdown_output.append(f"Default: `{default}`\n\n")
 
     def format_help(self):
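
The three added lines guard against empty-string defaults, which would otherwise render as an empty code span in the generated markdown. A standalone illustration of the effect (not part of the hook itself):

```python
# Why an empty-string default is rewritten to '""' before rendering.
default = ""
print(f"Default: `{default}`")  # -> Default: ``   (empty code span, easy to misread)

if default == "":
    default = '""'
print(f"Default: `{default}`")  # -> Default: `""` (visibly an empty string)
```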

@@ -150,17 +158,23 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
 
     # Create parsers to document
     parsers = {
+        # Engine args
         "engine_args": create_parser(EngineArgs.add_cli_args),
         "async_engine_args": create_parser(
             AsyncEngineArgs.add_cli_args, async_args_only=True
         ),
-        "serve": create_parser(cli_args.make_arg_parser),
+        # CLI
+        "serve": create_parser(openai_cli_args.make_arg_parser),
         "chat": create_parser(ChatCommand.add_cli_args),
         "complete": create_parser(CompleteCommand.add_cli_args),
-        "bench_latency": create_parser(latency.add_cli_args),
-        "bench_throughput": create_parser(throughput.add_cli_args),
-        "bench_serve": create_parser(serve.add_cli_args),
-        "run-batch": create_parser(run_batch.make_arg_parser),
+        "run-batch": create_parser(openai_run_batch.make_arg_parser),
+        # Benchmark CLI
+        "bench_latency": create_parser(bench_latency.add_cli_args),
+        "bench_serve": create_parser(bench_serve.add_cli_args),
+        "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
+        "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
+        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+        "bench_throughput": create_parser(bench_throughput.add_cli_args),
     }
 
     # Generate documentation for each parser
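
The loop that consumes this `parsers` dict sits below the hunk and is not shown here. Roughly, each entry is rendered through the markdown formatter and written to `docs/argparse/<key>.md`, which is what the new `--8<--` includes in the sweep pages pull in. The function name and exact I/O below are an illustrative guess, not code from the hook:

```python
# Hypothetical sketch of the generation step; the hook's real loop may differ.
from pathlib import Path

ARGPARSE_DOC_DIR = Path("docs/argparse")  # matches the include paths used above


def write_argparse_docs(parsers: dict) -> None:
    ARGPARSE_DOC_DIR.mkdir(parents=True, exist_ok=True)
    for stem, parser in parsers.items():
        # e.g. "bench_sweep_serve" -> docs/argparse/bench_sweep_serve.md,
        # pulled into docs/cli/bench/sweep/serve.md via --8<--.
        (ARGPARSE_DOC_DIR / f"{stem}.md").write_text(parser.format_help())
```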