diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index f022a55e62..96e39fd92e 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -37,6 +37,7 @@ def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    do_profile: bool,
     disable_detokenize: bool = False,
 ) -> tuple[float, Optional[list[RequestOutput]]]:
     from vllm import LLM, SamplingParams
@@ -75,10 +76,14 @@ def run_vllm(
     outputs = None
     if not use_beam_search:
         start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
         outputs = llm.generate(prompts,
                                sampling_params,
                                lora_request=lora_requests,
                                use_tqdm=True)
+        if do_profile:
+            llm.stop_profile()
         end = time.perf_counter()
     else:
         assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -88,6 +93,8 @@ def run_vllm(
         for request in requests:
             assert request.expected_output_len == output_len
         start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
         llm.beam_search(
             prompts,
             BeamSearchParams(
@@ -95,6 +102,8 @@ def run_vllm(
                 max_tokens=output_len,
                 ignore_eos=True,
             ))
+        if do_profile:
+            llm.stop_profile()
         end = time.perf_counter()
     return end - start, outputs
 
@@ -103,6 +112,7 @@ def run_vllm_chat(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    do_profile: bool,
     disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
     """
     Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
@@ -133,7 +143,11 @@ def run_vllm_chat(
             detokenize=not disable_detokenize,
         ))
     start = time.perf_counter()
+    if do_profile:
+        llm.start_profile()
     outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    if do_profile:
+        llm.stop_profile()
     end = time.perf_counter()
     return end - start, outputs
 
@@ -142,6 +156,7 @@ async def run_vllm_async(
     requests: list[SampleRequest],
     n: int,
     engine_args: AsyncEngineArgs,
+    do_profile: bool,
     disable_frontend_multiprocessing: bool = False,
     disable_detokenize: bool = False,
 ) -> float:
@@ -185,6 +200,8 @@ async def run_vllm_async(
 
         generators = []
         start = time.perf_counter()
+        if do_profile:
+            await llm.start_profile()
         for i, (prompt, sp,
                 lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
             generator = llm.generate(prompt,
@@ -195,6 +212,8 @@ async def run_vllm_async(
         all_gens = merge_async_iterators(*generators)
         async for i, res in all_gens:
             pass
+        if do_profile:
+            await llm.stop_profile()
         end = time.perf_counter()
         return end - start
 
@@ -543,6 +562,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
                         type=str,
                         default=None,
                         help="Split of the HF dataset.")
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        default=False,
+        help="Use Torch Profiler. The env variable "
+        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
 
     # prefix repetition dataset
     prefix_repetition_group = parser.add_argument_group(
@@ -600,22 +625,27 @@ def main(args: argparse.Namespace):
                 requests,
                 args.n,
                 AsyncEngineArgs.from_cli_args(args),
-                args.disable_frontend_multiprocessing,
-                args.disable_detokenize,
+                disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
+                disable_detokenize=args.disable_detokenize,
+                do_profile=args.profile,
             ))
         else:
            elapsed_time, request_outputs = run_vllm(
                 requests, args.n, EngineArgs.from_cli_args(args),
-                args.disable_detokenize)
+                disable_detokenize=args.disable_detokenize,
+                do_profile=args.profile)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
+        if args.profile:
+            raise NotImplementedError(
+                "Profiling not implemented yet for backend='hf'.")
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.hf_max_batch_size, args.trust_remote_code,
                               args.disable_detokenize)
     elif args.backend == "vllm-chat":
         elapsed_time, request_outputs = run_vllm_chat(
             requests, args.n, EngineArgs.from_cli_args(args),
-            args.disable_detokenize)
+            disable_detokenize=args.disable_detokenize, do_profile=args.profile)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
 
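
Usage sketch (not part of the diff; assumes the benchmark is launched through the "vllm bench throughput" entry point that wraps this module, and that /tmp/vllm_profile is a writable trace directory):

    # Profile a vLLM-backend run; traces are written to VLLM_TORCH_PROFILER_DIR.
    VLLM_TORCH_PROFILER_DIR=/tmp/vllm_profile \
        vllm bench throughput --backend vllm --model <model> --profile

The new flag only wraps the timed generate/beam-search/chat region for the vllm and vllm-chat backends (including --async-engine); --backend hf rejects --profile with NotImplementedError.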