diff --git a/docs/.nav.yml b/docs/.nav.yml
index f57703c329..acedc32c30 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -11,7 +11,7 @@ nav:
     - Quick Links:
       - User Guide: usage/README.md
       - Developer Guide: contributing/README.md
-      - API Reference: api/summary.md
+      - API Reference: api/README.md
       - CLI Reference: cli/README.md
     - Timeline:
       - Roadmap: https://roadmap.vllm.ai
@@ -58,11 +58,9 @@ nav:
     - CI: contributing/ci
   - Design Documents: design
   - API Reference:
-    - Summary: api/summary.md
-    - Contents:
-      - api/vllm/*
-  - CLI Reference:
-    - Summary: cli/README.md
+    - api/README.md
+    - api/vllm/*
+  - CLI Reference: cli
   - Community:
     - community/*
     - Blog: https://blog.vllm.ai
diff --git a/docs/api/summary.md b/docs/api/README.md
similarity index 100%
rename from docs/api/summary.md
rename to docs/api/README.md
diff --git a/docs/cli/.meta.yml b/docs/cli/.meta.yml
new file mode 100644
index 0000000000..0e1f7eccee
--- /dev/null
+++ b/docs/cli/.meta.yml
@@ -0,0 +1 @@
+toc_depth: 3
\ No newline at end of file
diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml
new file mode 100644
index 0000000000..6c2c09d566
--- /dev/null
+++ b/docs/cli/.nav.yml
@@ -0,0 +1,8 @@
+nav:
+  - README.md
+  - serve.md
+  - chat.md
+  - complete.md
+  - run-batch.md
+  - vllm bench:
+    - bench/*.md
diff --git a/docs/cli/README.md b/docs/cli/README.md
index b512a4f4ba..c708eb7958 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -1,7 +1,3 @@
----
-toc_depth: 4
----
-
 # vLLM CLI Guide
 
 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@@ -16,52 +12,48 @@ Available Commands:
 vllm {chat,complete,serve,bench,collect-env,run-batch}
 ```
 
-When passing JSON CLI arguments, the following sets of arguments are equivalent:
-
-- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
-- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
-
-Additionally, list elements can be passed individually using `+`:
-
-- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
-- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
-
 ## serve
 
-Start the vLLM OpenAI Compatible API server.
+Starts the vLLM OpenAI Compatible API server.
 
-??? console "Examples"
+Start with a model:
 
-    ```bash
-    # Start with a model
-    vllm serve meta-llama/Llama-2-7b-hf
+```bash
+vllm serve meta-llama/Llama-2-7b-hf
+```
 
-    # Specify the port
-    vllm serve meta-llama/Llama-2-7b-hf --port 8100
+Specify the port:
 
-    # Serve over a Unix domain socket
-    vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
+```bash
+vllm serve meta-llama/Llama-2-7b-hf --port 8100
+```
 
-    # Check with --help for more options
-    # To list all groups
-    vllm serve --help=listgroup
+Serve over a Unix domain socket:
 
-    # To view a argument group
-    vllm serve --help=ModelConfig
+```bash
+vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
+```
 
-    # To view a single argument
-    vllm serve --help=max-num-seqs
+Check with --help for more options:
 
-    # To search by keyword
-    vllm serve --help=max
+```bash
+# To list all groups
+vllm serve --help=listgroup
 
-    # To view full help with pager (less/more)
-    vllm serve --help=page
-    ```
+# To view an argument group
+vllm serve --help=ModelConfig
 
-### Options
+# To view a single argument
+vllm serve --help=max-num-seqs
 
---8<-- "docs/argparse/serve.md"
+# To search by keyword
+vllm serve --help=max
+
+# To view full help with pager (less/more)
+vllm serve --help=page
+```
+
+See [vllm serve](./serve.md) for the full reference of all available arguments.
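The JSON-argument tip removed in the hunk above is relocated, not deleted: it reappears as `docs/cli/json_tip.inc.md` later in this patch so every generated CLI page can include it. As a quick illustration of the equivalence the tip describes, here is a minimal sketch; `set_nested` is a hypothetical helper written for this example, not vLLM's actual parsing code:

```python
import json

def set_nested(config: dict, dotted_key: str, value: str) -> None:
    """Fold a dotted key path such as "key2.key3" into a nested dict."""
    *parents, leaf = dotted_key.split(".")
    node = config
    for part in parents:
        node = node.setdefault(part, {})
    node[leaf] = value

# --json-arg.key1 value1 --json-arg.key2.key3 value2 ...
config: dict = {}
set_nested(config, "key1", "value1")
set_nested(config, "key2.key3", "value2")

# ...builds the same mapping as --json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'
assert config == json.loads('{"key1": "value1", "key2": {"key3": "value2"}}')
```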
 
 ## chat
 
@@ -78,6 +70,8 @@
 vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
 vllm chat --quick "hi"
 ```
+See [vllm chat](./chat.md) for the full reference of all available arguments.
+
 ## complete
 
 Generate text completions based on the given prompt via the running API server.
@@ -93,7 +87,7 @@
 vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
 vllm complete --quick "The future of AI is"
 ```
-
+See [vllm complete](./complete.md) for the full reference of all available arguments.
 
 ## bench
 
@@ -120,6 +114,8 @@ vllm bench latency \
     --load-format dummy
 ```
 
+See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments.
+
 ### serve
 
 Benchmark the online serving throughput.
@@ -134,6 +130,8 @@ vllm bench serve \
     --num-prompts 5
 ```
 
+See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
+
 ### throughput
 
 Benchmark offline inference throughput.
@@ -147,6 +145,8 @@ vllm bench throughput \
     --load-format dummy
 ```
 
+See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments.
+
 ## collect-env
 
 Start collecting environment information.
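A note on the `chat` and `complete` sections above: both subcommands are thin clients for a server started with `vllm serve`, speaking its OpenAI-compatible API. A rough Python equivalent of `vllm chat`/`vllm complete` in quick mode, using the `openai` client; the URL and model name below are placeholder assumptions, not values from this patch:

```python
from openai import OpenAI

# Point the standard OpenAI client at a local vLLM server,
# assuming `vllm serve <model>` is already listening on port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Llama-2-7b-hf",  # whichever model the server loaded
    prompt="The future of AI is",
)
# Mirrors what `vllm complete --quick` prints for a single prompt.
print(completion.choices[0].text)
```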
@@ -159,24 +159,25 @@ vllm collect-env
 
 Run batch prompts and write results to file.
 
-
-Examples
+Running with a local file:
 
 ```bash
-# Running with a local file
 vllm run-batch \
     -i offline_inference/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
+```
 
-# Using remote file
+Using a remote file:
+
+```bash
 vllm run-batch \
     -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
-
+See [vllm run-batch](./run-batch.md) for the full reference of all available arguments.
 
 ## More Help
diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md
new file mode 100644
index 0000000000..21ab13e637
--- /dev/null
+++ b/docs/cli/bench/latency.md
@@ -0,0 +1,9 @@
+# vllm bench latency
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_latency.md"
diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md
new file mode 100644
index 0000000000..f7c415c6be
--- /dev/null
+++ b/docs/cli/bench/serve.md
@@ -0,0 +1,9 @@
+# vllm bench serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_serve.md"
diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md
new file mode 100644
index 0000000000..e4ff5ce43c
--- /dev/null
+++ b/docs/cli/bench/throughput.md
@@ -0,0 +1,9 @@
+# vllm bench throughput
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_throughput.md"
diff --git a/docs/cli/chat.md b/docs/cli/chat.md
new file mode 100644
index 0000000000..b006cb8de6
--- /dev/null
+++ b/docs/cli/chat.md
@@ -0,0 +1,5 @@
+# vllm chat
+
+## Options
+
+--8<-- "docs/argparse/chat.md"
diff --git a/docs/cli/complete.md b/docs/cli/complete.md
new file mode 100644
index 0000000000..400359acf4
--- /dev/null
+++ b/docs/cli/complete.md
@@ -0,0 +1,5 @@
+# vllm complete
+
+## Options
+
+--8<-- "docs/argparse/complete.md"
diff --git a/docs/cli/json_tip.inc.md b/docs/cli/json_tip.inc.md
new file mode 100644
index 0000000000..c22430c264
--- /dev/null
+++ b/docs/cli/json_tip.inc.md
@@ -0,0 +1,9 @@
+When passing JSON CLI arguments, the following sets of arguments are equivalent:
+
+- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
+- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
+
+Additionally, list elements can be passed individually using `+`:
+
+- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
+- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
\ No newline at end of file
diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md
new file mode 100644
index 0000000000..f7d401b8da
--- /dev/null
+++ b/docs/cli/run-batch.md
@@ -0,0 +1,9 @@
+# vllm run-batch
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/run-batch.md"
diff --git a/docs/cli/serve.md b/docs/cli/serve.md
new file mode 100644
index 0000000000..2c8f9d320f
--- /dev/null
+++ b/docs/cli/serve.md
@@ -0,0 +1,9 @@
+# vllm serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/serve.md"
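Each stub page added above is little more than a pair of `--8<--` snippet includes (pymdownx.snippets syntax). The `docs/argparse/*.md` targets do not exist in the repository; they are emitted at build time by the mkdocs hook modified below. A condensed sketch of that flow, using a plain `argparse` formatter as a stand-in for the hook's real `MarkdownFormatter`:

```python
from argparse import ArgumentParser
from pathlib import Path

# Build one command's parser (the hook does this via add_cli_args functions).
parser = ArgumentParser(prog="vllm chat")
parser.add_argument("-q", "--quick", metavar="MESSAGE",
                    help="Send a single prompt as MESSAGE and exit.")

# Write the rendered help where `--8<-- "docs/argparse/chat.md"` points.
# The real hook swaps in MarkdownFormatter so this step emits markdown
# rather than plain help text.
out = Path("docs/argparse/chat.md")
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(parser.format_help())
```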
diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md
index e7ca08b557..05d4f76230 100644
--- a/docs/configuration/engine_args.md
+++ b/docs/configuration/engine_args.md
@@ -11,15 +11,7 @@ Engine arguments control the behavior of the vLLM engine.
 
 The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
 
-When passing JSON CLI arguments, the following sets of arguments are equivalent:
-
-- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
-- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
-
-Additionally, list elements can be passed individually using `+`:
-
-- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
-- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
+--8<-- "docs/cli/json_tip.inc.md"
 
 ## `EngineArgs`
 
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index b003b5fd6c..ed5d3b0092 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -15,8 +15,14 @@
 sys.modules["aiohttp"] = MagicMock()
 sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()
+from vllm.benchmarks import latency  # noqa: E402
+from vllm.benchmarks import serve  # noqa: E402
+from vllm.benchmarks import throughput  # noqa: E402
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
-from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
+from vllm.entrypoints.cli.openai import ChatCommand  # noqa: E402
+from vllm.entrypoints.cli.openai import CompleteCommand  # noqa: E402
+from vllm.entrypoints.openai import cli_args  # noqa: E402
+from vllm.entrypoints.openai import run_batch  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402
 
 logger = logging.getLogger("mkdocs")
@@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter):
             self._markdown_output.append(
                 f"Possible choices: {metavar}\n\n")
 
-        self._markdown_output.append(f"{action.help}\n\n")
+        if action.help:
+            self._markdown_output.append(f"{action.help}\n\n")
 
         if (default := action.default) != SUPPRESS:
             self._markdown_output.append(f"Default: `{default}`\n\n")
@@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter):
         return "".join(self._markdown_output)
 
 
-def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
+def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
     """Create a parser for the given class with markdown formatting.
 
     Args:
@@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
     Returns:
         FlexibleArgumentParser: A parser with markdown formatting for the class.
""" - parser = FlexibleArgumentParser() + parser = FlexibleArgumentParser(add_json_tip=False) parser.formatter_class = MarkdownFormatter with patch("vllm.config.DeviceConfig.__post_init__"): - return cls.add_cli_args(parser, **kwargs) - - -def create_serve_parser() -> FlexibleArgumentParser: - """Create a parser for the serve command with markdown formatting.""" - parser = FlexibleArgumentParser() - parser.formatter_class = lambda prog: MarkdownFormatter( - prog, starting_heading_level=4) - return make_arg_parser(parser) + _parser = add_cli_args(parser, **kwargs) + # add_cli_args might be in-place so return parser if _parser is None + return _parser or parser def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): @@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Create parsers to document parsers = { - "engine_args": create_parser(EngineArgs), - "async_engine_args": create_parser(AsyncEngineArgs, - async_args_only=True), - "serve": create_serve_parser(), + "engine_args": + create_parser(EngineArgs.add_cli_args), + "async_engine_args": + create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True), + "serve": + create_parser(cli_args.make_arg_parser), + "chat": + create_parser(ChatCommand.add_cli_args), + "complete": + create_parser(CompleteCommand.add_cli_args), + "bench_latency": + create_parser(latency.add_cli_args), + "bench_throughput": + create_parser(throughput.add_cli_args), + "bench_serve": + create_parser(serve.add_cli_args), + "run-batch": + create_parser(run_batch.make_arg_parser), } # Generate documentation for each parser diff --git a/requirements/docs.txt b/requirements/docs.txt index c589093110..a24b9c7e92 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -29,3 +29,5 @@ setproctitle torch transformers zmq +uvloop +prometheus-client diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index bbd18ca3ae..fdf6548ada 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -24,8 +24,6 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput @@ -146,6 +144,8 @@ async def run_vllm_async( disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) async with build_async_engine_client_from_engine_args( engine_args, diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index e71f77ba80..7c01de94a3 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -130,28 +130,33 @@ class ChatCommand(CLISubcommand): conversation.append(response_message) # type: ignore print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - chat_parser = subparsers.add_parser( - "chat", - help="Generate chat completions via the running API server.", - description="Generate chat completions via the running API server.", - usage="vllm chat [options]") - _add_query_options(chat_parser) - chat_parser.add_argument( + @staticmethod + def add_cli_args(parser: 
FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the chat command.""" + _add_query_options(parser) + parser.add_argument( "--system-prompt", type=str, default=None, help=("The system prompt to be added to the chat template, " "used for models that support system prompts.")) - chat_parser.add_argument("-q", - "--quick", - type=str, - metavar="MESSAGE", - help=("Send a single prompt as MESSAGE " - "and print the response, then exit.")) - return chat_parser + parser.add_argument("-q", + "--quick", + type=str, + metavar="MESSAGE", + help=("Send a single prompt as MESSAGE " + "and print the response, then exit.")) + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server.", + description="Generate chat completions via the running API server.", + usage="vllm chat [options]") + return ChatCommand.add_cli_args(parser) class CompleteCommand(CLISubcommand): @@ -179,25 +184,30 @@ class CompleteCommand(CLISubcommand): output = completion.choices[0].text print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - complete_parser = subparsers.add_parser( - "complete", - help=("Generate text completions based on the given prompt " - "via the running API server."), - description=("Generate text completions based on the given prompt " - "via the running API server."), - usage="vllm complete [options]") - _add_query_options(complete_parser) - complete_parser.add_argument( + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the complete command.""" + _add_query_options(parser) + parser.add_argument( "-q", "--quick", type=str, metavar="PROMPT", help= "Send a single prompt and print the completion output, then exit.") - return complete_parser + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "complete", + help=("Generate text completions based on the given prompt " + "via the running API server."), + description=("Generate text completions based on the given prompt " + "via the running API server."), + usage="vllm complete [options]") + return CompleteCommand.add_cli_args(parser) def cmd_init() -> list[CLISubcommand]: diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a10d57456b..01551a8c7f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger # yapf: disable -from vllm.entrypoints.openai.api_server import build_async_engine_client from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, BatchResponseData, @@ -34,7 +33,6 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_score import ServingScores from vllm.logger import init_logger -from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -469,6 +467,9 @@ async def run_batch( async def main(args: Namespace): + from vllm.entrypoints.openai.api_server import build_async_engine_client + 
from vllm.usage.usage_lib import UsageContext + async with build_async_engine_client( args, usage_context=UsageContext.OPENAI_BATCH_RUNNER, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a4997226ea..095829db83 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1682,6 +1682,8 @@ class FlexibleArgumentParser(ArgumentParser): # Set the default "formatter_class" to SortedHelpFormatter if "formatter_class" not in kwargs: kwargs["formatter_class"] = SortedHelpFormatter + # Pop kwarg "add_json_tip" to control whether to add the JSON tip + self.add_json_tip = kwargs.pop("add_json_tip", True) super().__init__(*args, **kwargs) if sys.version_info < (3, 13): @@ -1726,7 +1728,8 @@ class FlexibleArgumentParser(ArgumentParser): def format_help(self) -> str: # Add tip about JSON arguments to the epilog epilog = self.epilog or "" - if not epilog.startswith(FlexibleArgumentParser._json_tip): + if (self.add_json_tip + and not epilog.startswith(FlexibleArgumentParser._json_tip)): self.epilog = FlexibleArgumentParser._json_tip + epilog return super().format_help()
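The `add_json_tip` flag in the final hunk works because the subclass pops the keyword before delegating to `ArgumentParser.__init__`, which rejects unknown keyword arguments. A self-contained sketch of the same pattern, with a shortened stand-in for the real `_json_tip` text:

```python
from argparse import ArgumentParser

class TipParser(ArgumentParser):
    _json_tip = "Tip: nested values can be passed as --arg.key value.\n"

    def __init__(self, *args, **kwargs):
        # Pop the custom kwarg first: ArgumentParser.__init__ raises
        # TypeError on keyword arguments it does not recognize.
        self.add_json_tip = kwargs.pop("add_json_tip", True)
        super().__init__(*args, **kwargs)

    def format_help(self) -> str:
        # Prepend the tip to the epilog exactly once, unless disabled.
        epilog = self.epilog or ""
        if self.add_json_tip and not epilog.startswith(self._json_tip):
            self.epilog = self._json_tip + epilog
        return super().format_help()

assert "Tip:" in TipParser(prog="demo").format_help()
assert "Tip:" not in TipParser(prog="demo", add_json_tip=False).format_help()
```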