[Misc] Clean up flags in vllm bench serve (#25138)
Signed-off-by: Roger Wang <hey@rogerw.io>
@@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -245,7 +243,6 @@ vllm bench serve \
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \

@@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
         "5",
         "--endpoint",
         "/v1/chat/completions",
-        "--endpoint-type",
+        "--backend",
         "openai-chat",
     ]
     result = subprocess.run(command, capture_output=True, text=True)
@@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     elif args.dataset_name == "sonnet":
         dataset = SonnetDataset(dataset_path=args.dataset_path)
         # For the "sonnet" dataset, formatting depends on the backend.
-        if args.endpoint_type == "openai-chat":
+        if args.backend == "openai-chat":
             input_requests = dataset.sample(
                 num_requests=args.num_prompts,
                 input_len=args.sonnet_input_len,
@@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             "Please consider contributing if you would "
             "like to add support for additional dataset formats.")

-        if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [
+        if dataset_class.IS_MULTIMODAL and args.backend not in [
                 "openai-chat",
                 "openai-audio",
         ]:
@@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             # endpoint-type.
             raise ValueError(
                 "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' endpoint-type.")
+                "'openai-audio' backends.")
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
@@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:

     try:
         # Enforce endpoint compatibility for multimodal datasets.
-        if args.dataset_name == "random-mm" and args.endpoint_type not in [
+        if args.dataset_name == "random-mm" and args.backend not in [
                 "openai-chat"]:
             raise ValueError(
                 "Multi-modal content (images) is only supported on "

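The hunks above route the dataset compatibility checks through `args.backend` instead of the deprecated `args.endpoint_type`. As a rough illustration of the rule being enforced (a sketch, not the vLLM source; the function and constant names below are invented for this example), a multimodal dataset is rejected unless the chosen backend speaks a chat- or audio-style API:

```python
# Hypothetical sketch of the backend-compatibility rule enforced above.
MULTIMODAL_BACKENDS = {"openai-chat", "openai-audio"}  # assumed set, mirroring the diff


def check_backend_compatibility(dataset_name: str, is_multimodal: bool,
                                backend: str) -> None:
    """Raise early when a multimodal dataset is paired with an unsupported backend."""
    if is_multimodal and backend not in MULTIMODAL_BACKENDS:
        raise ValueError(
            "Multi-modal content is only supported on 'openai-chat' and "
            "'openai-audio' backends.")
    if dataset_name == "random-mm" and backend != "openai-chat":
        raise ValueError(
            "Multi-modal content (images) is only supported on 'openai-chat'.")


# Example: this combination would raise, matching the checks in get_samples().
# check_backend_compatibility("random-mm", True, "openai")
```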
@@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server:

 On the client side, run:
     vllm bench serve \
-        --endpoint-type <endpoint_type. Default 'openai'> \
-        --label <benchmark result label. Default using endpoint_type> \
+        --backend <backend or endpoint type. Default 'openai'> \
+        --label <benchmark result label. Default using backend> \
         --model <your_model> \
         --dataset-name <dataset_name. Default 'random'> \
         --request-rate <request_rate. Default inf> \
@@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
                           and (shutil.which("gnuplot") is not None))


+# TODO: Remove this in v0.11.0
+class DeprecatedEndpointTypeAction(argparse.Action):
+    """Argparse action for the deprecated --endpoint-type flag.
+    """
+
+    def __call__(self, _, namespace, values, option_string=None):
+        warnings.warn(
+            "'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+            "Please use '--backend' instead or remove this argument if you "
+            "have already set it.",
+            stacklevel=1,
+        )
+        setattr(namespace, self.dest, values)
+
+
 class TaskType(Enum):
     GENERATION = "generation"
     EMBEDDING = "embedding"
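For readers unfamiliar with this argparse pattern, here is a minimal, self-contained sketch of how such a deprecation shim behaves (illustrative only; `DeprecatedFlagAction` and the parser below are not part of the vLLM code): the deprecated option still stores its value, but parsing it emits a warning steering users to `--backend`.

```python
import argparse
import warnings


class DeprecatedFlagAction(argparse.Action):
    """Warn when a deprecated option is used, then store the value as usual."""

    def __call__(self, parser, namespace, values, option_string=None):
        warnings.warn(
            f"'{option_string}' is deprecated; please use '--backend' instead.",
            stacklevel=1,
        )
        setattr(namespace, self.dest, values)


parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, default="openai")
parser.add_argument("--endpoint-type", type=str, default=None,
                    action=DeprecatedFlagAction)

# Using the old spelling still works, but prints a UserWarning.
args = parser.parse_args(["--endpoint-type", "openai-chat"])
print(args.backend, args.endpoint_type)  # -> openai openai-chat
```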
@@ -470,7 +485,7 @@ async def benchmark(
         else:
             request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
     else:
-        raise ValueError(f"Unknown endpoint_type: {endpoint_type}")
+        raise ValueError(f"Unknown backend: {endpoint_type}")

     # Reuses connections across requests to reduce TLS handshake overhead.
     connector = aiohttp.TCPConnector(
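The error message now says "Unknown backend", matching the registry lookup just above it. A schematic version of that dispatch pattern (the names and registry contents are illustrative, not the actual `ASYNC_REQUEST_FUNCS`):

```python
from typing import Callable

# Assumed registry shape: backend name -> request callable.
REQUEST_FUNCS: dict[str, Callable[..., None]] = {
    "openai": lambda **kw: None,
    "openai-chat": lambda **kw: None,
}


def resolve_request_func(backend: str) -> Callable[..., None]:
    """Look up the per-backend request function, failing loudly on typos."""
    try:
        return REQUEST_FUNCS[backend]
    except KeyError:
        raise ValueError(f"Unknown backend: {backend}") from None


resolve_request_func("openai-chat")    # ok
# resolve_request_func("openai-chta")  # -> ValueError: Unknown backend: openai-chta
```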
@@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,

 def add_cli_args(parser: argparse.ArgumentParser):
     add_dataset_parser(parser)
-    parser.add_argument(
-        "--endpoint-type",
-        type=str,
-        default="openai",
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-    )
     parser.add_argument(
         "--label",
         type=str,
         default=None,
         help="The label (prefix) of the benchmark results. If not specified, "
-        "the endpoint type will be used as the label.",
+        "the value of '--backend' will be used as the label.",
     )
     parser.add_argument(
         "--backend",
         type=str,
-        default="vllm",
+        default="openai",
         choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        help="The type of backend or endpoint to use for the benchmark."
     )
+    parser.add_argument(
+        "--endpoint-type",
+        type=str,
+        default=None,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        action=DeprecatedEndpointTypeAction,
+        help="'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+        "Please use '--backend' instead.",
+    )
     parser.add_argument(
         "--base-url",
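With the reshuffled arguments above, `--backend` becomes the primary selector (defaulting to `'openai'`) and `--label` stays optional. A quick sketch of the resulting behaviour, using a bare parser with a trimmed choices list rather than the real `add_cli_args`:

```python
import argparse

# Trimmed stand-in for list(ASYNC_REQUEST_FUNCS.keys()); the real set is larger.
CHOICES = ["openai", "openai-chat", "openai-audio"]

parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, default="openai", choices=CHOICES,
                    help="The type of backend or endpoint to use for the benchmark.")
parser.add_argument("--label", type=str, default=None,
                    help="Label (prefix) of the results; defaults to '--backend'.")

# No flags at all: the benchmark targets the plain 'openai' completions backend.
defaults = parser.parse_args([])
print(defaults.backend, defaults.label)  # -> openai None

# Explicit chat backend, still no label: the label falls back to the backend name.
args = parser.parse_args(["--backend", "openai-chat"])
print(args.label or args.backend)        # -> openai-chat
```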
@@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         raise ValueError(
             "For exponential ramp-up, the start RPS cannot be 0.")

-    endpoint_type = args.endpoint_type
     label = args.label
     model_id = args.model
     model_name = args.served_model_name
@@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     gc.freeze()

     benchmark_result = await benchmark(
-        endpoint_type=args.endpoint_type,
+        endpoint_type=args.backend,
         api_url=api_url,
         base_url=base_url,
         model_id=model_id,
@@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     # Setup
     current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
     result_json["date"] = current_dt
-    result_json["endpoint_type"] = args.endpoint_type
+    result_json["endpoint_type"] = args.backend  # for backward compatibility
+    result_json["backend"] = args.backend
     result_json["label"] = label
     result_json["model_id"] = model_id
     result_json["tokenizer_id"] = tokenizer_id
@@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         base_model_id = model_id.split("/")[-1]
         max_concurrency_str = (f"-concurrency{args.max_concurrency}"
                                if args.max_concurrency is not None else "")
-        label = label or endpoint_type
+        label = label or args.backend
         if args.ramp_up_strategy is not None:
             file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
         else:
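Downstream consumers of the saved results now see both the new `backend` key and the backward-compatible `endpoint_type` key, and the output filename is derived from the label (or the backend name when no label is given). A rough sketch of what a saved record and a ramp-up filename look like under these rules (all input values below are invented for illustration):

```python
from datetime import datetime

# Invented inputs, for illustration only.
backend = "openai-chat"
label = None
model_id = "Qwen/Qwen2-VL-7B-Instruct"
max_concurrency = 8
ramp_up_strategy = "linear"
ramp_up_start_rps, ramp_up_end_rps = 1, 10

current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json = {
    "date": current_dt,
    "endpoint_type": backend,  # mirrored for backward compatibility
    "backend": backend,
    "label": label,
    "model_id": model_id,
}

# Label falls back to the backend name, then feeds the result file name
# (following the ramp-up naming pattern shown in the hunk above).
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{max_concurrency}"
                       if max_concurrency is not None else "")
label = label or backend
file_name = (f"{label}-ramp-up-{ramp_up_strategy}-{ramp_up_start_rps}qps-"
             f"{ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json")
print(file_name)
# -> openai-chat-ramp-up-linear-1qps-10qps-concurrency8-Qwen2-VL-7B-Instruct-<timestamp>.json
```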