[Doc] Add engine args back in to the docs (#20674)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
.gitignore (vendored, 1 line changed)
@@ -146,6 +146,7 @@ venv.bak/
 
 # mkdocs documentation
 /site
+docs/argparse
 docs/examples
 
 # mypy
@@ -1,3 +1,7 @@
+---
+toc_depth: 3
+---
+
 # Engine Arguments
 
 Engine arguments control the behavior of the vLLM engine.
@@ -5,11 +9,12 @@ Engine arguments control the behavior of the vLLM engine.
 - For [offline inference](../serving/offline_inference.md), they are part of the arguments to [LLM][vllm.LLM] class.
 - For [online serving](../serving/openai_compatible_server.md), they are part of the arguments to `vllm serve`.
 
-You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.
+The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
 
-However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented.
+For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config.
 
-!!! note
-    Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. These can be found by running `vllm serve --help`
+## `EngineArgs`
+
+--8<-- "docs/argparse/engine_args.md"
+
+## `AsyncEngineArgs`
+
+--8<-- "docs/argparse/async_engine_args.md"
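The offline/online split above is worth making concrete. A minimal sketch, not part of this commit, showing the same engine arguments supplied in both modes (model name and values are illustrative):

```python
# Offline inference: engine arguments are keyword arguments of vllm.LLM.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",    # illustrative model
    max_model_len=2048,           # an EngineArgs field
    gpu_memory_utilization=0.8,   # another EngineArgs field
)

# Online serving: the same fields become flags of `vllm serve`, e.g.
#   vllm serve facebook/opt-125m --max-model-len 2048 --gpu-memory-utilization 0.8
```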
docs/mkdocs/hooks/generate_argparse.py (new file, 105 lines)
@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import sys
from argparse import SUPPRESS, HelpFormatter
from pathlib import Path
from typing import Literal
from unittest.mock import MagicMock, patch

ROOT_DIR = Path(__file__).parent.parent.parent.parent
ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"

sys.path.insert(0, str(ROOT_DIR))
sys.modules["aiohttp"] = MagicMock()
sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
from vllm.utils import FlexibleArgumentParser  # noqa: E402

logger = logging.getLogger("mkdocs")


class MarkdownFormatter(HelpFormatter):
    """Custom formatter that generates markdown for argument groups."""

    def __init__(self, prog):
        super().__init__(prog,
                         max_help_position=float('inf'),
                         width=float('inf'))
        self._markdown_output = []

    def start_section(self, heading):
        if heading not in {"positional arguments", "options"}:
            self._markdown_output.append(f"\n### {heading}\n\n")

    def end_section(self):
        pass

    def add_text(self, text):
        if text:
            self._markdown_output.append(f"{text.strip()}\n\n")

    def add_usage(self, usage, actions, groups, prefix=None):
        pass

    def add_arguments(self, actions):
        for action in actions:

            option_strings = f'`{"`, `".join(action.option_strings)}`'
            self._markdown_output.append(f"#### {option_strings}\n\n")

            if choices := action.choices:
                choices = f'`{"`, `".join(str(c) for c in choices)}`'
                self._markdown_output.append(
                    f"Possible choices: {choices}\n\n")

            self._markdown_output.append(f"{action.help}\n\n")

            if (default := action.default) != SUPPRESS:
                self._markdown_output.append(f"Default: `{default}`\n\n")

    def format_help(self):
        """Return the formatted help as markdown."""
        return "".join(self._markdown_output)


def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
    """Create a parser for the given class with markdown formatting.

    Args:
        cls: The class to create a parser for
        **kwargs: Additional keyword arguments to pass to `cls.add_cli_args`.

    Returns:
        FlexibleArgumentParser: A parser with markdown formatting for the class.
    """
    parser = FlexibleArgumentParser()
    parser.formatter_class = MarkdownFormatter
    with patch("vllm.config.DeviceConfig.__post_init__"):
        return cls.add_cli_args(parser, **kwargs)


def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    logger.info("Generating argparse documentation")
    logger.debug("Root directory: %s", ROOT_DIR.resolve())
    logger.debug("Output directory: %s", ARGPARSE_DOC_DIR.resolve())

    # Create the ARGPARSE_DOC_DIR if it doesn't exist
    if not ARGPARSE_DOC_DIR.exists():
        ARGPARSE_DOC_DIR.mkdir(parents=True)

    # Create parsers to document
    parsers = {
        "engine_args": create_parser(EngineArgs),
        "async_engine_args": create_parser(AsyncEngineArgs,
                                           async_args_only=True),
    }

    # Generate documentation for each parser
    for stem, parser in parsers.items():
        doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
        with open(doc_path, "w") as f:
            f.write(parser.format_help())
        logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
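As a sanity check of the markdown this hook emits, here is a small sketch (not part of the commit) that runs `MarkdownFormatter` over a toy parser; the `--log-level` flag is made up for illustration:

```python
# Sketch only: MarkdownFormatter turns each argument into a "#### `--flag`"
# block, followed by its choices (if any), help text and default.
# Assumes MarkdownFormatter from the hook above is in scope.
from argparse import ArgumentParser

toy = ArgumentParser(prog="toy", formatter_class=MarkdownFormatter)
toy.add_argument("--log-level",
                 choices=["debug", "info", "warning"],
                 default="info",
                 help="Illustrative flag, not a real vLLM argument.")
print(toy.format_help())
```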
@@ -161,8 +161,8 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     for example in sorted(examples, key=lambda e: e.path.stem):
         example_name = f"{example.path.stem}.md"
         doc_path = EXAMPLE_DOC_DIR / example.category / example_name
-        logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
         if not doc_path.parent.exists():
             doc_path.parent.mkdir(parents=True)
         with open(doc_path, "w+") as f:
             f.write(example.generate())
+        logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
docs/mkdocs/overrides/partials/toc-item.html (new file, 21 lines)
@@ -0,0 +1,21 @@
<!-- Enables the use of toc_depth in document frontmatter https://github.com/squidfunk/mkdocs-material/issues/4827#issuecomment-1869812019 -->
<li class="md-nav__item">
  <a href="{{ toc_item.url }}" class="md-nav__link">
    <span class="md-ellipsis">
      {{ toc_item.title }}
    </span>
  </a>

  <!-- Table of contents list -->
  {% if toc_item.children %}
    <nav class="md-nav" aria-label="{{ toc_item.title | striptags }}">
      <ul class="md-nav__list">
        {% for toc_item in toc_item.children %}
          {% if not page.meta.toc_depth or toc_item.level <= page.meta.toc_depth %}
            {% include "partials/toc-item.html" %}
          {% endif %}
        {% endfor %}
      </ul>
    </nav>
  {% endif %}
</li>
@@ -3,6 +3,7 @@ site_url: https://docs.vllm.ai
 repo_url: https://github.com/vllm-project/vllm
 edit_uri: edit/main/docs/
 exclude_docs: |
+  argparse
   *.inc.md
   *.template.md
 theme:
@@ -47,6 +48,7 @@ theme:
 hooks:
   - docs/mkdocs/hooks/remove_announcement.py
   - docs/mkdocs/hooks/generate_examples.py
+  - docs/mkdocs/hooks/generate_argparse.py
   - docs/mkdocs/hooks/url_schemes.py
 
 # Required to stop api-autonav from raising an error
@@ -7,3 +7,18 @@ mkdocs-awesome-nav
 python-markdown-math
 regex
 ruff
+
+# Required for argparse hook only
+-f https://download.pytorch.org/whl/cpu
+cachetools
+cloudpickle
+fastapi
+msgspec
+openai
+pillow
+psutil
+pybase64
+pydantic
+torch
+transformers
+zmq
@@ -12,8 +12,9 @@ import threading
 import warnings
 from dataclasses import MISSING, dataclass, fields, is_dataclass
 from itertools import permutations
-from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
-                    Type, TypeVar, Union, cast, get_args, get_origin)
+from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
+                    Literal, Optional, Type, TypeVar, Union, cast, get_args,
+                    get_origin)
 
 import regex as re
 import torch
@@ -33,20 +34,26 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
                          TaskOption, TokenizerMode, TokenizerPoolConfig,
                          VllmConfig, get_attr_docs, get_field)
-from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
                         GiB_bytes, get_ip, is_in_ray_actor)
 
 # yapf: enable
 
+if TYPE_CHECKING:
+    from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.usage.usage_lib import UsageContext
+else:
+    ExecutorBase = Any
+    QuantizationMethods = Any
+    UsageContext = Any
+
 logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
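The import shuffle above follows a standard pattern: names needed only for type annotations are imported under `TYPE_CHECKING` and aliased to `Any` at runtime, which keeps importing `arg_utils` cheap and is what lets the new docs hook import `EngineArgs` with `aiohttp`, `blake3` and `vllm._C` replaced by `MagicMock`. A minimal sketch of the pattern, with made-up module and class names:

```python
# Sketch only: defer a heavy import to type-checking time.
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Evaluated by type checkers, never at runtime.
    from heavy_package import HeavyClass
else:
    # The name still exists at runtime, so annotations keep working
    # even when heavy_package is missing or mocked out.
    HeavyClass = Any


def describe(obj: HeavyClass) -> str:
    return f"got {type(obj).__name__}"
```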
@@ -200,14 +207,17 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         kwargs[name] = {"default": default, "help": help}
 
         # Set other kwargs based on the type hints
-        json_tip = """\n\nShould either be a valid JSON string or JSON keys
-        passed individually. For example, the following sets of arguments are
-        equivalent:\n\n
-        - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
-        - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n
-        Additionally, list elements can be passed individually using '+':
-        - `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
-        - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`\n\n"""
+        json_tip = """Should either be a valid JSON string or JSON keys
+        passed individually. For example, the following sets of arguments are
+        equivalent:
+
+        - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
+        - `--json-arg.key1 value1 --json-arg.key2.key3 value2`
+
+        Additionally, list elements can be passed individually using `+`:
+
+        - `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
+        - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`"""
         if dataclass_cls is not None:
 
             def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
@@ -219,7 +229,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
                     raise argparse.ArgumentTypeError(repr(e)) from e
 
             kwargs[name]["type"] = parse_dataclass
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
             kwargs[name]["action"] = argparse.BooleanOptionalAction
@@ -255,7 +265,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
             kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
             kwargs[name]["type"] = parse_type(json.loads)
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif (contains_type(type_hints, str)
               or any(is_not_builtin(th) for th in type_hints)):
             kwargs[name]["type"] = str
@@ -1545,7 +1555,6 @@ class EngineArgs:
         # Enable chunked prefill by default for long context (> 32K)
         # models to avoid OOM errors in initial memory profiling phase.
         elif use_long_context:
-            from vllm.platforms import current_platform
            is_gpu = current_platform.is_cuda()
            use_sliding_window = (model_config.get_sliding_window()
                                  is not None)
@@ -1653,6 +1662,7 @@ class EngineArgs:
        # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
        # throughput, see PR #17885 for more details.
        # So here we do an extra device name check to prevent such regression.
+       from vllm.usage.usage_lib import UsageContext
        if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
            # For GPUs like H100 and MI300x, use larger default values.
            default_max_num_batched_tokens = {
@@ -38,7 +38,6 @@ from typing_extensions import Required, TypeAlias, TypedDict
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model_cls
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
@@ -524,6 +523,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
     @cached_property
     def model_cls(self):
+        from vllm.model_executor.model_loader import get_model_cls
         return get_model_cls(self.model_config)
 
     @property
@@ -13,7 +13,6 @@ from typing_extensions import TypeVar
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import resolve_mm_processor_kwargs
 
 if TYPE_CHECKING:
@@ -21,6 +20,14 @@ if TYPE_CHECKING:
     from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
                                  MultiModalRegistry)
     from vllm.sequence import SequenceData
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    ModelConfig = Any
+    MultiModalDataDict = Any
+    MultiModalPlaceholderDict = Any
+    MultiModalRegistry = Any
+    SequenceData = Any
+    AnyTokenizer = Any
 
 _T = TypeVar("_T")
 _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
@@ -36,7 +43,7 @@ class InputContext:
     modify the inputs.
     """
 
-    model_config: "ModelConfig"
+    model_config: ModelConfig
     """The configuration of the model."""
 
     def get_hf_config(
@@ -200,9 +207,9 @@ class DummyData(NamedTuple):
     Note: This is only used in V0.
     """
 
-    seq_data: "SequenceData"
-    multi_modal_data: Optional["MultiModalDataDict"] = None
-    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
+    seq_data: SequenceData
+    multi_modal_data: Optional[MultiModalDataDict] = None
+    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
 
 
 class InputRegistry:
@@ -212,9 +219,9 @@ class InputRegistry:
 
     def dummy_data_for_profiling(
         self,
-        model_config: "ModelConfig",
+        model_config: ModelConfig,
         seq_len: int,
-        mm_registry: "MultiModalRegistry",
+        mm_registry: MultiModalRegistry,
         is_encoder_data: bool = False,
     ) -> DummyData:
         """
@@ -16,7 +16,6 @@ from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Callable, Optional, TypeVar, Union
 
-import cloudpickle
 import torch.nn as nn
 
 from vllm.logger import init_logger
@@ -598,6 +597,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
         output_filepath = os.path.join(tempdir, "registry_output.tmp")
 
         # `cloudpickle` allows pickling lambda functions directly
+        import cloudpickle
         input_bytes = cloudpickle.dumps((fn, output_filepath))
 
         # cannot use `sys.executable __file__` here because the script
@@ -7,7 +7,6 @@ import sys
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Optional
 
-import psutil
 import torch
 
 from vllm.logger import init_logger
@@ -73,6 +72,7 @@ class CpuPlatform(Platform):
 
     @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        import psutil
        return psutil.virtual_memory().total
 
    @classmethod
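Several of the smaller edits above and below (cloudpickle, get_model_cls, psutil, TokenizerRegistry) apply the complementary trick: move an import from module scope into the one function that needs it, so the module can be imported for the docs build without that package installed. A generic sketch mirroring the psutil case:

```python
# Sketch only: defer an optional dependency to the call site so that merely
# importing this module does not require psutil to be installed.
def total_memory_bytes() -> int:
    import psutil
    return psutil.virtual_memory().total
```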
@@ -7,14 +7,22 @@ import os
 from abc import abstractmethod
 from collections.abc import Sequence
 from functools import cached_property
-from typing import Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage, ResponsesRequest)
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import import_from_path, is_list_of
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                                  DeltaMessage,
+                                                  ResponsesRequest)
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    ChatCompletionRequest = Any
+    DeltaMessage = Any
+    ResponsesRequest = Any
+    AnyTokenizer = Any
+
 logger = init_logger(__name__)
@@ -16,15 +16,18 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
 
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
-                                                    TokenizerRegistry)
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
+    from vllm.lora.request import LoRARequest
+    from vllm.transformers_utils.tokenizer_base import TokenizerBase
+else:
+    ModelConfig = Any
+    LoRARequest = Any
+    TokenizerBase = Any
 
 logger = init_logger(__name__)
@@ -222,6 +225,7 @@ def get_tokenizer(
         tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                      revision=revision)
     elif tokenizer_mode == "custom":
+        from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
         tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
                                                     *args,
                                                     revision=revision,
@@ -271,7 +275,7 @@ cached_get_tokenizer = lru_cache(get_tokenizer)
 
 
 def cached_tokenizer_from_config(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     **kwargs: Any,
 ):
     return cached_get_tokenizer(