Compare commits

1 commit on `codex/upda...` (`benchmark_...`)

| Author | SHA1 | Date |
|---|---|---|
|  | 221118dc85 |  |
```diff
@@ -275,7 +275,7 @@ async def benchmark(
     model_id: str,
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: list[SampleRequest],
+    requests: list[SampleRequest],
     logprobs: Optional[int],
     request_rate: float,
     burstiness: float,
@@ -295,12 +295,14 @@ async def benchmark(
         raise ValueError(f"Unknown backend: {backend}")

     print("Starting initial single prompt test run...")
+    last_idx = len(requests) - 1
     test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0].prompt,
-        input_requests[0].prompt_len,
-        input_requests[0].expected_output_len,
-        input_requests[0].multi_modal_data,
+        requests[last_idx].prompt,
+        requests[last_idx].prompt_len,
+        requests[last_idx].expected_output_len,
+        requests[last_idx].multi_modal_data,
     )
+    input_requests = requests[:last_idx]

     assert test_mm_content is None or isinstance(test_mm_content, dict)
     test_input = RequestFuncInput(
@@ -615,6 +617,9 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

+    # Create one more request (for a test request)
+    total_prompts = args.num_prompts + 1
+
    tokenizer = get_tokenizer(
        tokenizer_id,
        tokenizer_mode=tokenizer_mode,
@@ -632,7 +637,7 @@ def main(args: argparse.Namespace):
        # For the "sonnet" dataset, formatting depends on the backend.
        if args.backend == "openai-chat":
            input_requests = dataset.sample(
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
@@ -644,7 +649,7 @@ def main(args: argparse.Namespace):
                "Tokenizer/model must have chat template for sonnet dataset."
            )
            input_requests = dataset.sample(
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
@@ -707,7 +712,7 @@ def main(args: argparse.Namespace):
            dataset_split=args.hf_split,
            random_seed=args.seed,
        ).sample(
-            num_requests=args.num_prompts,
+            num_requests=total_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
        )
@@ -719,15 +724,15 @@ def main(args: argparse.Namespace):
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                output_len=args.sharegpt_output_len,
            ),
            "burstgpt": lambda: BurstGPTDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
-            ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+            ).sample(tokenizer=tokenizer, num_requests=total_prompts),
            "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
+                num_requests=total_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
@@ -774,7 +779,7 @@ def main(args: argparse.Namespace):
            model_id=model_id,
            model_name=model_name,
            tokenizer=tokenizer,
-            input_requests=input_requests,
+            requests=input_requests,
            logprobs=args.logprobs,
            request_rate=args.request_rate,
            burstiness=args.burstiness,
```
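Summarizing the change above: the benchmark now samples one extra request (`total_prompts = args.num_prompts + 1`), reserves the last sampled request for the initial single-prompt test run, and benchmarks only the remaining ones. A minimal sketch of that pattern, using a hypothetical `sample_requests` helper in place of the real dataset samplers:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SampleRequest:
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[dict] = None


def sample_requests(n: int) -> list[SampleRequest]:
    # Hypothetical stand-in for ShareGPTDataset.sample() and friends.
    return [SampleRequest(f"prompt {i}", 8, 16) for i in range(n)]


def split_test_request(num_prompts: int) -> tuple[SampleRequest, list[SampleRequest]]:
    # Sample one extra request; the last one becomes the warm-up test prompt,
    # the rest are the actual benchmark workload.
    requests = sample_requests(num_prompts + 1)
    last_idx = len(requests) - 1
    test_request = requests[last_idx]
    input_requests = requests[:last_idx]
    return test_request, input_requests


if __name__ == "__main__":
    test_req, workload = split_test_request(num_prompts=4)
    print(test_req.prompt, len(workload))  # -> "prompt 4" 4
```

This keeps the warm-up prompt out of the measured workload, so the benchmark still measures exactly `args.num_prompts` requests.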
```diff
@@ -14,14 +14,8 @@ This document provides an overview of the vLLM architecture.
 vLLM provides a number of entrypoints for interacting with the system. The
 following diagram shows the relationship between them.

-:::{mermaid}
-flowchart TD
-    CLI["vllm CLI"] --> APIServer["OpenAI API Server"]
-    LLM["LLM Class"] --> LLMEngine
-    APIServer --> AsyncLLMEngine
-    LLMEngine --> EngineCoreClient
-    AsyncLLMEngine --> EngineCoreClient
-    EngineCoreClient --> EngineCore
-:::
+:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
+:alt: Entrypoints Diagram
+:::

 ### LLM Class
```
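For orientation, the `LLM` class named in the `### LLM Class` context line is vLLM's offline-inference entrypoint. A typical usage sketch (exact parameters may vary between vLLM versions; the model id is just an example):

```python
# Offline inference through the LLM class entrypoint.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # any small HF model id works for a smoke test
sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)
```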
```diff
@@ -90,14 +84,8 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o
 The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
 the vLLM system, handling model inference and asynchronous request processing.

-:::{mermaid}
-flowchart LR
-    Processor --> EngineCoreClient
-    EngineCoreClient --> EngineCore
-    EngineCore --> Executor
-    Executor --> Worker
-    Worker --> ModelRunner
-    ModelRunner --> Model
-:::
+:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
+:alt: LLMEngine Diagram
+:::

 ### LLMEngine
@@ -116,7 +104,7 @@ processing.
 - **Output Processing**: Processes the outputs generated by the model, decoding the
   token IDs from a language model into human-readable text.

-The code for `LLMEngine` can be found in <gh-file:vllm/v1/engine/llm_engine.py>.
+The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.

 ### AsyncLLMEngine

@@ -128,7 +116,7 @@ can handle multiple concurrent requests and stream outputs to clients.
 The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
 API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.

-The code for `AsyncLLMEngine` can be found in <gh-file:vllm/v1/engine/async_llm.py>.
+The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.

 ## Worker

```
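The `AsyncLLMEngine` hunks above describe an engine that handles multiple concurrent requests and streams outputs back to clients. A toy asyncio sketch of that request/streaming shape, deliberately not using vLLM's real API:

```python
import asyncio
from typing import AsyncIterator


async def toy_generate(prompt: str) -> AsyncIterator[str]:
    # Stand-in for an async engine's generate(): yields partial outputs as they appear.
    for token in prompt.split():
        await asyncio.sleep(0.01)  # pretend this is one decoding step
        yield token


async def handle_request(request_id: int, prompt: str) -> None:
    async for chunk in toy_generate(prompt):
        print(f"request {request_id}: {chunk}")


async def main() -> None:
    prompts = ["the quick brown fox", "hello world"]
    # Multiple requests are served concurrently and streamed independently.
    await asyncio.gather(*(handle_request(i, p) for i, p in enumerate(prompts)))


asyncio.run(main())
```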
```diff
@@ -152,29 +140,15 @@ Every model runner object has one model object, which is the actual
 `torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various
 configurations affect the class we ultimately get.

-## Class Hierarchy and vLLM V1 Architecture
+## Class Hierarchy

-The following diagram shows how the main classes interact:
+The following figure shows the class hierarchy of vLLM:

-:::{mermaid}
-classDiagram
-    class LLMEngine
-    class AsyncLLMEngine
-    class EngineCoreClient
-    class EngineCore
-    class Executor
-    class Worker
-    class ModelRunner
-    class Model
-
-    AsyncLLMEngine --> LLMEngine
-    LLMEngine --> EngineCoreClient
-    EngineCoreClient --> EngineCore
-    EngineCore --> Executor
-    Executor --> Worker
-    Worker --> ModelRunner
-    ModelRunner --> Model
-:::
+> :::{figure} /assets/design/hierarchy.png
+> :align: center
+> :alt: query
+> :width: 100%
+> :::

 There are several important design choices behind this class hierarchy:
```
```diff
@@ -276,32 +250,3 @@ big problem.

 In summary, the complete config object `VllmConfig` can be treated as an
 engine-level global state that is shared among all vLLM classes.
-
-vLLM V1 introduces a streamlined engine that splits responsibilities between a thin frontend and a highly optimized backend. The design is centered on three core layers:
-
-1. **Frontend (`LLMEngine` and `AsyncLLM`)** – user-facing classes that handle tokenization, batching of incoming requests, and postprocessing of generated outputs. These classes interact with the engine core through an `EngineCoreClient`.
-2. **Engine Core** – the inner loop that schedules requests and runs the model. The core lives in `vllm/v1/engine/core.py` and exposes a lightweight API for adding requests, aborting them, or stepping the model.
-3. **Executor and Workers** – the executor (for example `MultiprocExecutor` in <gh-file:vllm/v1/executor/multiproc_executor.py>) manages worker processes. Each worker controls a single accelerator device and hosts a `ModelRunner` (such as `GPUModelRunner` in <gh-file:vllm/v1/worker/gpu_model_runner.py>) which executes the forward pass.
-
-### EngineCore and Scheduler
-
-The `EngineCore` maintains a [`Scheduler`](<gh-file:vllm/v1/core/sched/scheduler.py>) and a `KVCacheManager` (<gh-file:vllm/v1/core/kv_cache_manager.py>). At each iteration the scheduler chooses how many tokens to process for every active `Request`, supporting features like prefix caching, chunked prefill and speculative decoding. Scheduled tokens are passed to the model runner and the resulting `EngineCoreOutputs` include generated tokens and per-request events.
-The scheduler keeps separate waiting and running queues and enforces limits from
-`VllmConfig` such as `max_num_seqs` and `max_num_batched_tokens`. When GPU
-memory becomes scarce it can preempt lower priority requests, freeing their KV
-cache blocks before resuming them later. After a step finishes it records
-statistics and updates each request's progress based on the returned events.
-
-### Communication via EngineCoreClient
-
-To overlap computation with I/O, the engine core often runs in a separate process. `EngineCoreClient` (<gh-file:vllm/v1/engine/core_client.py>) forwards requests and pulls results over ZeroMQ sockets. When using multiple data-parallel ranks, `DPAsyncMPClient` manages a set of engine-core processes and aggregates their outputs.
-
-### Workers and Model Runners
-
-Workers are defined in <gh-dir:vllm/v1/worker>. The default GPU worker initializes CUDA, sets up distributed communication and hosts a `GPUModelRunner` which loads the model, prepares KV cache memory and executes inference kernels. The runner also handles LoRA adapters, attention backends, and cudagraph capture.
-
-### Output Processing
-
-`OutputProcessor` (<gh-file:vllm/v1/engine/output_processor.py>) converts raw `EngineCoreOutputs` into `RequestOutput` objects, assembling logprobs, speculative tokens, and final texts. When using `AsyncLLM`, an asynchronous loop continuously fetches these outputs and streams them back to callers.
-
-This new layering keeps the hot path (`EngineCore`) minimal while letting the frontend focus on user interactions and request bookkeeping. It reduces CPU overhead and simplifies the addition of new optimizations.
```
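The removed section above walks through the V1 layering: a frontend submits requests through a client, `EngineCore` schedules them from separate waiting and running queues within `max_num_seqs`/`max_num_batched_tokens` limits, and an output processor turns raw step outputs into user-facing results. The following is a deliberately simplified, self-contained model of that loop, not vLLM's actual `EngineCore`/`Scheduler` code:

```python
# Toy model of the V1 layering described above: a core loop with waiting/running
# queues and a per-step token budget, plus a trivial "output processor".
from collections import deque
from dataclasses import dataclass, field


@dataclass
class Request:
    request_id: str
    prompt_tokens: int
    max_new_tokens: int
    generated: int = 0


@dataclass
class ToyEngineCore:
    max_num_seqs: int = 2               # stand-in for limits taken from VllmConfig
    max_num_batched_tokens: int = 8
    waiting: deque = field(default_factory=deque)
    running: list = field(default_factory=list)

    def add_request(self, req: Request) -> None:
        self.waiting.append(req)

    def step(self) -> list[tuple[str, int]]:
        # Admit waiting requests up to the max_num_seqs limit.
        while self.waiting and len(self.running) < self.max_num_seqs:
            self.running.append(self.waiting.popleft())

        # Give each running request a share of the per-step token budget.
        outputs = []
        budget = self.max_num_batched_tokens
        for req in list(self.running):
            take = min(budget, req.max_new_tokens - req.generated)
            if take <= 0:
                continue
            budget -= take
            req.generated += take            # pretend the model ran here
            outputs.append((req.request_id, take))
            if req.generated >= req.max_new_tokens:
                self.running.remove(req)     # request finished
        return outputs


def process_outputs(step_outputs: list[tuple[str, int]]) -> None:
    # Stand-in for an output processor: turn raw core outputs into user-facing ones.
    for request_id, n_tokens in step_outputs:
        print(f"{request_id}: +{n_tokens} tokens")


core = ToyEngineCore()
for i in range(3):
    core.add_request(Request(request_id=f"req-{i}", prompt_tokens=4, max_new_tokens=6))

while core.waiting or core.running:
    process_outputs(core.step())
```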
```diff
@@ -26,7 +26,6 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"

 class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
-    sp_enabled: bool
     eager_mode: bool
     chunked_prefill: bool
@@ -61,7 +60,6 @@ class SPTestSettings:
     def detailed(
         *,
         tp_base: int = 2,
         pp_base: int = 1,
         multi_node_only: bool = False,
         task: TaskOption = "auto",
         load_format: Optional[str] = None,
@@ -69,42 +67,18 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=True),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=True,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=True,
                               chunked_prefill=True),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=2 * pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=2 * pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=True),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=2 * pp_base,
                               sp_enabled=True,
                               eager_mode=True,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=2 * pp_base,
                               sp_enabled=True,
                               eager_mode=True,
                               chunked_prefill=True)
@@ -120,7 +94,6 @@ class SPTestSettings:
     def fast(
         *,
         tp_base: int = 2,
         pp_base: int = 1,
         task: TaskOption = "auto",
         multi_node_only: bool = False,
         load_format: Optional[str] = None,
@@ -128,12 +101,6 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
                               pp_size=2 * pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
@@ -169,7 +136,6 @@ def _compare_sp(
 ):
     (
         tp_size,
         pp_size,
-        sp_enabled,
         eager_mode,
         chunked_prefill,
@@ -201,6 +167,7 @@ def _compare_sp(
     else:
         model_info.check_available_online(on_fail="skip")

+    pp_size = 1
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
@@ -289,7 +256,7 @@ def _compare_sp(

 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(),
 }

 SP_TEST_MODELS = [
```
```diff
@@ -4287,6 +4287,18 @@ class VllmConfig:
             self.compilation_config.level = CompilationLevel.PIECEWISE
             self.compilation_config.set_splitting_ops_for_v1()

+        if self.parallel_config is not None and \
+            self.parallel_config.tensor_parallel_size > 1 and \
+            self.parallel_config.pipeline_parallel_size > 1 and \
+            self.compilation_config is not None and \
+            self.compilation_config.pass_config is not None and \
+            self.compilation_config.pass_config.enable_sequence_parallelism:
+            logger.warning_once(
+                "Sequence parallelism is not supported with pipeline "
+                "parallelism. Disabling sequence parallelism.")
+            self.compilation_config.pass_config.\
+                enable_sequence_parallelism = False
+
         self._set_cudagraph_sizes()

         if self.cache_config is not None and \
```
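The block added above disables sequence parallelism whenever tensor and pipeline parallelism are both greater than one. The same guard, reduced to a standalone sketch with hypothetical config dataclasses:

```python
# Standalone sketch of the guard added above, using hypothetical config classes.
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class PassConfig:
    enable_sequence_parallelism: bool = False


@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1


def maybe_disable_sequence_parallelism(parallel: ParallelConfig, passes: PassConfig) -> None:
    # Sequence parallelism and pipeline parallelism cannot be combined, so
    # downgrade to a supported configuration and warn instead of failing.
    if (parallel.tensor_parallel_size > 1
            and parallel.pipeline_parallel_size > 1
            and passes.enable_sequence_parallelism):
        logger.warning("Sequence parallelism is not supported with pipeline "
                       "parallelism. Disabling sequence parallelism.")
        passes.enable_sequence_parallelism = False


passes = PassConfig(enable_sequence_parallelism=True)
maybe_disable_sequence_parallelism(
    ParallelConfig(tensor_parallel_size=2, pipeline_parallel_size=2), passes)
assert passes.enable_sequence_parallelism is False
```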
```diff
@@ -1056,40 +1056,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             indices=out_indices,
         )

-    def sync_and_slice_intermediate_tensors(
-            self, num_tokens: int, intermediate_tensors: IntermediateTensors,
-            sync_self: bool) -> IntermediateTensors:
-
-        assert self.intermediate_tensors is not None
-
-        tp = self.vllm_config.parallel_config.tensor_parallel_size
-        enabled_sp = self.vllm_config.compilation_config.pass_config. \
-            enable_sequence_parallelism
-        if enabled_sp:
-            # When sequence parallelism is enabled, we always pad num_tokens
-            # to be a multiple of tensor_parallel_size (tp) earlier
-            assert num_tokens % tp == 0
-        is_residual_scattered = tp > 1 and enabled_sp \
-            and num_tokens % tp == 0
-
-        # When sequence parallelism is enabled, the "residual" tensor is sharded
-        # across tensor parallel ranks, so each rank only needs its own slice.
-        if sync_self:
-            assert intermediate_tensors is not None
-            for k, v in intermediate_tensors.items():
-                is_scattered = "residual" and is_residual_scattered
-                copy_len = num_tokens // tp if is_scattered else \
-                    num_tokens
-                self.intermediate_tensors[k][:copy_len].copy_(
-                    v[:copy_len], non_blocking=True)
-
-        return IntermediateTensors({
-            k:
-            v[:num_tokens // tp]
-            if k == "residual" and is_residual_scattered else v[:num_tokens]
-            for k, v in self.intermediate_tensors.items()
-        })
-
     @torch.inference_mode()
     def execute_model(
         self,
@@ -1165,8 +1131,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
-            intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                num_input_tokens, intermediate_tensors, True)
+            assert intermediate_tensors is not None
+            assert self.intermediate_tensors is not None
+            for k, v in intermediate_tensors.items():
+                self.intermediate_tensors[k][:num_input_tokens].copy_(
+                    v[:num_input_tokens], non_blocking=True)
+            intermediate_tensors = IntermediateTensors({
+                k: v[:num_input_tokens]
+                for k, v in self.intermediate_tensors.items()
+            })

         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
@@ -1685,9 +1658,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     batch_size=self.max_num_tokens,
                     dtype=self.model_config.dtype,
                     device=self.device))

-            intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                num_tokens, None, False)
+            intermediate_tensors = IntermediateTensors({
+                k: v[:num_tokens]
+                for k, v in self.intermediate_tensors.items()
+            })

         with set_forward_context(attn_metadata,
                                  self.vllm_config,
```
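Both the removed helper and its inlined replacement copy pipeline-parallel intermediate tensors into persistent buffers and then slice them to the current token count; per the comments above, when sequence parallelism is enabled the `residual` tensor is sharded across tensor-parallel ranks, so only `num_tokens // tp` rows of it are needed. A standalone sketch of that slicing rule (assuming `torch` and toy buffer shapes):

```python
# Standalone sketch of the intermediate-tensor slicing rule described above.
# Assumes persistent buffers sized for the maximum token count; "residual" is
# the only tensor sharded across tensor-parallel ranks when SP is enabled.
import torch


def slice_intermediate_tensors(
    buffers: dict[str, torch.Tensor],
    num_tokens: int,
    tp_size: int,
    sequence_parallel: bool,
) -> dict[str, torch.Tensor]:
    residual_scattered = (tp_size > 1 and sequence_parallel
                          and num_tokens % tp_size == 0)
    return {
        name: tensor[:num_tokens // tp_size]
        if name == "residual" and residual_scattered else tensor[:num_tokens]
        for name, tensor in buffers.items()
    }


buffers = {
    "hidden_states": torch.zeros(16, 8),
    "residual": torch.zeros(16, 8),
}
sliced = slice_intermediate_tensors(buffers, num_tokens=8, tp_size=2,
                                    sequence_parallel=True)
print(sliced["hidden_states"].shape, sliced["residual"].shape)  # (8, 8) and (4, 8)
```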