From 469b3ffaaadb6ab15ddbebc47ac11a0c6fddfda2 Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:04:46 -0700 Subject: [PATCH 001/932] [V1] port xformers backend to v1 (#21342) Signed-off-by: Giancarlo Delfin --- tests/v1/attention/utils.py | 2 + vllm/engine/arg_utils.py | 1 + vllm/platforms/cuda.py | 4 + vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/tree_attn.py | 1 - vllm/v1/attention/backends/xformers.py | 430 ++++++++++++++++++++++++ 6 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/attention/backends/xformers.py diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 78a6509986..e9e574501d 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -128,6 +128,8 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", _Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", + _Backend.XFORMERS_VLLM_V1: + "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", } if backend_name not in backend_map: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5eb9660cd1..3e2f03d56c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1469,6 +1469,7 @@ class EngineArgs: "TORCH_SDPA_VLLM_V1", "FLEX_ATTENTION", "TREE_ATTN", + "XFORMERS_VLLM_V1", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b61b39a927..dd9356e399 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -271,6 +271,7 @@ class CudaPlatformBase(Platform): TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 + XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") @@ -291,6 +292,9 @@ class CudaPlatformBase(Platform): elif selected_backend == _Backend.TREE_ATTN: logger.info_once("Using Tree Attention backend on V1 engine.") return TREE_ATTN_V1 + elif selected_backend == _Backend.XFORMERS_VLLM_V1: + logger.info_once("Using XFormers backend on V1 engine.") + return XFORMERS_V1 from vllm.attention.selector import is_attn_backend_supported diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 61ce868c13..a85b583abc 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -63,6 +63,7 @@ class _Backend(enum.Enum): NO_ATTENTION = enum.auto() FLEX_ATTENTION = enum.auto() TREE_ATTN = enum.auto() + XFORMERS_VLLM_V1 = enum.auto() class PlatformEnum(enum.Enum): diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index a071f0921d..3b53b039f1 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -316,7 +316,6 @@ class TreeAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size diff --git a/vllm/v1/attention/backends/xformers.py 
b/vllm/v1/attention/backends/xformers.py new file mode 100644 index 0000000000..fe732c6017 --- /dev/null +++ b/vllm/v1/attention/backends/xformers.py @@ -0,0 +1,430 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with XFormersAttention.""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.triton_unified_attention import unified_attention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, CommonAttentionMetadata, + reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec + +try: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import ( + AttentionBias, PagedBlockDiagonalCausalWithOffsetPaddedKeysMask) + + XFORMERS_AVAILABLE = True +except ImportError: + XFORMERS_AVAILABLE = False + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch + +from vllm import _custom_ops as ops + +logger = init_logger(__name__) + + +class XFormersAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [ + 32, + 40, + 48, + 56, + 64, + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 128, + 136, + 144, + 152, + 160, + 168, + 176, + 184, + 192, + 200, + 208, + 216, + 224, + 232, + 240, + 248, + 256, + ] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. " + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "XFORMERS_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["XFormersAttentionImpl"]: + return XFormersAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return XFormersAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]: + return XFormersAttentionMetadataBuilder + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +@dataclass +class XFormersAttentionMetadata: + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + num_prefill_tokens: int = 0 + num_decode_tokens: int = 0 + num_prefills: int = 0 + num_decodes: int = 0 + + # Biases for different attention types. 
+ attn_bias: Optional["AttentionBias"] = None + + # Self-attention prefill/decode metadata cache + _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None + _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None + + @property + def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + # Recover cached prefill-phase attention + # metadata structure + return self._cached_prefill_metadata + + q_start_loc = self.query_start_loc[self.num_decodes:] + q_seqlens = torch.diff(q_start_loc) + kv_seqlens = self.seq_lens[self.num_decodes:] + # Construct & cache prefill-phase attention metadata structure + self._cached_prefill_metadata = XFormersAttentionMetadata( + num_actual_tokens=self.num_prefill_tokens, + max_query_len=int(q_seqlens.max().item()), + query_start_loc=q_start_loc - q_start_loc[0], + max_seq_len=int(kv_seqlens.max().item()), + seq_lens=kv_seqlens, + block_table=self.block_table[self.num_decodes:], + slot_mapping=self.slot_mapping[self.num_decode_tokens:], + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + # Recover cached decode-phase attention + # metadata structure + return self._cached_decode_metadata + + q_start_loc = self.query_start_loc + q_seqlens = torch.diff(q_start_loc) + decode_kv_seqlens = self.seq_lens[:self.num_decodes] + # Construct & cache decode-phase attention metadata structure + self._cached_decode_metadata = XFormersAttentionMetadata( + num_actual_tokens=self.num_decode_tokens, + max_query_len=int(q_seqlens[:self.num_decodes].max().item()), + query_start_loc=q_start_loc[:self.num_decodes + 1], + max_seq_len=int(decode_kv_seqlens.max().item()), + seq_lens=decode_kv_seqlens, + block_table=self.block_table[:self.num_decodes], + slot_mapping=self.slot_mapping[:self.num_decode_tokens], + attn_bias=self.attn_bias, + ) + return self._cached_decode_metadata + + +class XFormersAttentionMetadataBuilder( + AttentionMetadataBuilder[XFormersAttentionMetadata]): + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + assert XFORMERS_AVAILABLE + self.kv_cache_spec = kv_cache_spec + self.block_size = kv_cache_spec.block_size + self._num_decodes = 0 + self._num_decode_tokens = 0 + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + return reorder_batch_to_split_decodes_and_prefills(input_batch, + scheduler_output, + decode_threshold=1) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> XFormersAttentionMetadata: + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=1)) + + num_actual_tokens = common_attn_metadata.num_actual_tokens + q_start_loc = common_attn_metadata.query_start_loc + q_seqlens = torch.diff(q_start_loc) + max_query_len = common_attn_metadata.max_query_len + kv_seqlens = common_attn_metadata.seq_lens + max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + bias = None + if num_decodes > 0: + # Construct the decoder bias. 
+ decode_q_seqlens = q_seqlens[:num_decodes] + decode_kv_seqlens = kv_seqlens[:num_decodes] + bias = ( + PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( + q_seqlen=decode_q_seqlens.tolist(), + kv_seqlen=decode_kv_seqlens.tolist(), + page_size=self.block_size, + block_tables=block_table[:num_decodes], + device=block_table.device, + )) + + return XFormersAttentionMetadata( + num_actual_tokens=num_actual_tokens, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + num_prefills=num_prefills, + num_decodes=num_decodes, + max_query_len=max_query_len, + query_start_loc=q_start_loc, + max_seq_len=max_seq_len, + seq_lens=kv_seqlens, + block_table=block_table, + slot_mapping=slot_mapping, + attn_bias=bias, + ) + + +class XFormersAttentionImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") + if alibi_slopes is not None: + raise NotImplementedError( + "XFormers does not support alibi slopes yet.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + if logits_soft_cap is None: + # Setting logits_soft_cap to 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + + XFormersAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "XFormersAttentionImpl.") + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: XFormersAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with XFormers. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert output is not None, "Output tensor must be provided." + + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for XFormersAttentionImpl") + + if attn_metadata is None: + # Profiling run. + return output + + # Cache the input KVs. + key_cache, value_cache = kv_cache.unbind(0) + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. 
+ # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + num_actual_tokens = attn_metadata.num_actual_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + if prefill_meta := attn_metadata.prefill_metadata: + descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, + key.shape[1]) + unified_attention( + q=query[num_decode_tokens:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[num_decode_tokens:num_actual_tokens], + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + seqused_k=prefill_meta.seq_lens, + max_seqlen_k=prefill_meta.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=prefill_meta.block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + + if decode_meta := attn_metadata.decode_metadata: + # Query for decode. KV is not needed because it is already cached. + decode_query = query[:num_decode_tokens] + # Reshape query to [1, B_T, G, H, D]. + q = decode_query.view(1, -1, self.num_kv_heads, + self.num_queries_per_kv, self.head_size) + # Reshape the k and v caches to [1, Bkv_T, G, H, D] + cache_k = key_cache.view(1, -1, self.num_kv_heads, 1, + self.head_size).expand( + 1, + -1, + self.num_kv_heads, + self.num_queries_per_kv, + self.head_size, + ) + cache_v = value_cache.view(1, -1, self.num_kv_heads, 1, + self.head_size).expand( + 1, + -1, + self.num_kv_heads, + self.num_queries_per_kv, + self.head_size, + ) + + attn_bias = decode_meta.attn_bias + output[: + num_decode_tokens] = xops.memory_efficient_attention_forward( + q, + cache_k, + cache_v, + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + ).view(decode_query.shape) + + # Reshape the output tensor. 
+ return output From 59a0b8554bf0e8a9902e14e3d0e564fea38157b6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 6 Aug 2025 01:26:09 +0800 Subject: [PATCH 002/932] [bugfix] fix blackwell deepep installation (#22255) --- tools/ep_kernels/README.md | 10 +++++----- tools/ep_kernels/install_python_libraries.sh | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index 273e0f378e..85e9d2a4f8 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -13,16 +13,16 @@ All scripts accept a positional argument as workspace path for staging the build ## Usage -### Single-node - ```bash -bash install_python_libraries.sh +# for hopper +TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh +# for blackwell +TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh ``` -### Multi-node +Additional step for multi-node deployment: ```bash -bash install_python_libraries.sh sudo bash configure_system_drivers.sh sudo reboot # Reboot is required to load the new driver ``` diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 9d1b2da3b4..e163c83e8b 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -29,6 +29,12 @@ if [ -z "$CUDA_HOME" ]; then exit 1 fi +# assume TORCH_CUDA_ARCH_LIST is set correctly +if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then + echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture." + exit 1 +fi + # disable all features except IBGDA export NVSHMEM_IBGDA_SUPPORT=1 @@ -95,7 +101,7 @@ clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation -PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e . +PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . 
popd # build and install deepep, require pytorch installed From 4b29d2784b3753fd5434cded25cbcf0bce7b7da7 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Tue, 5 Aug 2025 16:54:56 -0700 Subject: [PATCH 003/932] [CI][TPU] Fix docker clean up (#22271) Signed-off-by: Siyuan Liu --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 3 +-- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 1 - .buildkite/scripts/tpu/config_v6e_1.env | 2 +- .buildkite/scripts/tpu/docker_run_bm.sh | 2 -- .buildkite/scripts/tpu/quantized_v6e_1.env | 2 +- 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index d998c1f73b..734a817fd1 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -4,8 +4,7 @@ set -xu remove_docker_container() { - docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; + docker rm -f tpu-test || true; } trap remove_docker_container EXIT diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index e565d4b246..9e7b5a5462 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -5,7 +5,6 @@ set -xu remove_docker_container() { docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; } trap remove_docker_container EXIT diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env index 03ec116f69..c9e3c26571 100644 --- a/.buildkite/scripts/tpu/config_v6e_1.env +++ b/.buildkite/scripts/tpu/config_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8b -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=meta-llama/Llama-3.1-8B-Instruct diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh index 8959877a3c..08e3661180 100755 --- a/.buildkite/scripts/tpu/docker_run_bm.sh +++ b/.buildkite/scripts/tpu/docker_run_bm.sh @@ -12,8 +12,6 @@ source /etc/environment source $ENV_FILE remove_docker_container() { - docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; docker rm -f $CONTAINER_NAME || true; } diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env index bab34b3be3..bd25c80308 100644 --- a/.buildkite/scripts/tpu/quantized_v6e_1.env +++ b/.buildkite/scripts/tpu/quantized_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8bw8a8 -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 From 35509fc5be5d840e84717ff24bba6bdd5cc33d77 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 20:05:40 -0400 Subject: [PATCH 004/932] [Bugfix] Remove faulty test for oot attention backend (#22286) Signed-off-by: mgoin --- tests/plugins_tests/test_platform_plugins.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index ef99c3dadd..1d7e447501 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -4,9 +4,7 @@ import pytest import torch -from vllm.attention.selector import get_attn_backend from vllm.plugins import load_general_plugins -from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -27,14 +25,6 @@ def test_platform_plugins(): f" is 
loaded. The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): - # ignore the backend env variable if it is set - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, "auto", 16, False) - assert backend.get_name() == "Dummy_Backend" - - def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch): # simulate workload by running an example load_general_plugins() From 6a5153043799dde3e22fae11f17c423c765f747b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 22:35:20 -0400 Subject: [PATCH 005/932] [Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278) Signed-off-by: mgoin --- vllm/_custom_ops.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e6f69e2344..92de394180 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -710,23 +710,25 @@ def cutlass_scaled_mm(a: torch.Tensor, scale_b.shape * [128, 128] == b.shape """ assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) - assert bias is None or bias.shape[0] == b.shape[ - 1] and bias.dtype == out_dtype + assert bias is None or bias.numel( + ) == b.shape[1] and bias.dtype == out_dtype - m = a.shape[0] - n = b.shape[1] + # Massage the input to be 2D + target_shape = (*a.shape[:-1], b.shape[1]) + a = a.view(-1, a.shape[-1]) cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) if current_platform.is_rocm() or not cutlass_compatible_b: from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa triton_scaled_mm) - return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + out = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + else: + out = torch.empty((a.shape[0], b.shape[1]), + dtype=out_dtype, + device=a.device) + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) - out = torch.empty((m, n), dtype=out_dtype, device=a.device) - - torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) - - return out + return out.view(*target_shape) def cutlass_scaled_mm_azp(a: torch.Tensor, @@ -746,15 +748,18 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) assert bias is None or bias.numel( ) == b.shape[1] and bias.dtype == out_dtype + + # Massage the input to be 2D + target_shape = (*a.shape[:-1], b.shape[1]) + a = a.view(-1, a.shape[-1]) assert azp is None or azp.numel() == a.shape[0] - m = a.shape[0] - n = b.shape[1] - out = torch.empty((m, n), dtype=out_dtype, device=a.device) - + out = torch.empty((a.shape[0], b.shape[1]), + dtype=out_dtype, + device=a.device) torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj, azp, bias) - return out + return out.view(*target_shape) def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: From 8e6c7e873f1a2830ab096d69ee1812b323aef650 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Aug 2025 10:56:22 +0800 Subject: [PATCH 006/932] [Bugfix] Fix MoE BNB version (#22260) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/quantization/bitsandbytes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index a96f3ee5c3..5359189caa 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ 
b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -412,12 +412,12 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.45.3": + if bitsandbytes.__version__ < "0.46.1": raise ImportError("bitsandbytes version is wrong. Please " - "install bitsandbytes>=0.45.3.") + "install bitsandbytes>=0.46.1.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.45.3 via " - "`pip install bitsandbytes>=0.45.3` to use " + raise ImportError("Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err self.topk_indices_dtype = None self.quant_config = quant_config From 7e6544c7978364fcb8178f4ab8b1325e45880aa9 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 5 Aug 2025 22:57:49 -0400 Subject: [PATCH 007/932] [Perf] Parallelize fill_bitmask to accelerate high-throughput guided decoding (#21862) Signed-off-by: Benjamin Chislett --- vllm/v1/structured_output/__init__.py | 125 +++++++++++++----- vllm/v1/structured_output/backend_xgrammar.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 9 +- 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index bd1dd01f90..63604a335d 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations import multiprocessing -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import Future, ThreadPoolExecutor from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig @@ -40,6 +40,17 @@ class StructuredOutputManager: self._grammar_bitmask: Optional[torch.Tensor] = None self._full_mask = torch.tensor(-1, dtype=torch.int32) + max_batch_size = self.vllm_config.scheduler_config.max_num_seqs + self.fill_bitmask_parallel_threshold = 128 + if self.fill_bitmask_parallel_threshold < max_batch_size: + self.fill_bitmask_parallel_batch_size = 16 + # Use: + # - at least 1 CPU + # - at most half the number of CPUs or 8, whichever is less + max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) + self.executor_for_fillmask = ThreadPoolExecutor( + max_workers=max_workers) + if not self.vllm_config.model_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, @@ -120,6 +131,26 @@ class StructuredOutputManager: assert self.backend is not None return self.backend.compile_grammar(request_type, grammar_spec) + def _fill_bitmasks( + self, + batch: list[tuple[StructuredOutputGrammar, int, bool]], + ) -> None: + assert self._grammar_bitmask is not None + for grammar, index, apply_bitmask in batch: + if apply_bitmask and not grammar.is_terminated(): + grammar.fill_bitmask(self._grammar_bitmask, index) + else: + # Note that for thinking support, we will need to + # reset the relevant part of the bitmask for consequent + # requests here. 
+ self._grammar_bitmask[index].fill_(self._full_mask) + + def _async_submit_fill_bitmask( + self, + batch: list[tuple[StructuredOutputGrammar, int, bool]], + ) -> Future: + return self.executor_for_fillmask.submit(self._fill_bitmasks, batch) + def grammar_bitmask( self, requests: dict[str, Request], @@ -146,7 +177,6 @@ class StructuredOutputManager: self.backend.allocate_token_bitmask( max_batch_size * (1 + max_num_spec_tokens)) - bitmask_tensor = self._grammar_bitmask # Generate a batched bitmask for all structured output requests. # When speculative decoding is enabled, we need to include multiple # masks for each request, one for each possible bonus token position. @@ -155,47 +185,61 @@ class StructuredOutputManager: ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1]) - # Note that for thinking support, we will need to - # reset the relevant part of the bitmask for consequent - # request here. - bitmask_tensor[:(len(ordered_seq) * (1 + max_num_spec_tokens))].fill_( - self._full_mask) + # Optimized parallel filling of bitmasks for + # non-spec, large-batch-size cases + if len(ordered_seq) > self.fill_bitmask_parallel_threshold and \ + max_num_spec_tokens == 0: + promises = [] + batch = [] + for req_id, _ in ordered_seq: + request = requests[req_id] + structured_output_request = request.structured_output_request + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None - # NOTE: This outer loop can likely be parallelized to improve - # performance of bitmask generation for large batches. - for req_id, _ in ordered_seq: - request = requests[req_id] - structured_output_request = request.structured_output_request + apply_bitmask = self.should_fill_bitmask(request) + batch.append((structured_output_request.grammar, + cumulative_index, apply_bitmask)) + if len(batch) == self.fill_bitmask_parallel_batch_size: + promises.append(self._async_submit_fill_bitmask(batch)) + batch = [] - if TYPE_CHECKING: - assert structured_output_request is not None - assert structured_output_request.grammar is not None - apply_bitmask: bool = True - if self.reasoner is not None: - if structured_output_request.reasoning_ended is None: - structured_output_request.reasoning_ended = \ - self.reasoner.is_reasoning_end(request.prompt_token_ids) - apply_bitmask = structured_output_request.reasoning_ended + cumulative_index += 1 + if batch: + promises.append(self._async_submit_fill_bitmask(batch)) - state_advancements = 0 - req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] - for i, token in enumerate(req_tokens): - if apply_bitmask and not \ - structured_output_request.grammar.is_terminated(): - structured_output_request.grammar.fill_bitmask( - bitmask_tensor, cumulative_index) - if token is not None: - # In order to generate the correct bitmask for each - # position in the speculative sequence, we advance - # the FSM state for each speculative token and rollback - # to restore the previous state when we are finished. + # Wait for all bitmask filling tasks to complete. 
+ for promise in promises: + promise.result() + else: + # Fallback to serial filling of bitmasks for small-batch-size cases + for req_id, _ in ordered_seq: + request = requests[req_id] + structured_output_request = request.structured_output_request + + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None + apply_bitmask = self.should_fill_bitmask(request) + + state_advancements = 0 + req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + for i, token in enumerate(req_tokens + [None]): + self._fill_bitmasks([(structured_output_request.grammar, + cumulative_index, apply_bitmask)]) + + if apply_bitmask and token is not None and \ + not structured_output_request.grammar.is_terminated(): assert structured_output_request.grammar.accept_tokens( req_id, [token]) state_advancements += 1 - cumulative_index += 1 - if state_advancements > 0: - structured_output_request.grammar.rollback(state_advancements) + cumulative_index += 1 + if state_advancements > 0: + structured_output_request.grammar.rollback( + state_advancements) + bitmask_tensor = self._grammar_bitmask if cumulative_index < bitmask_tensor.shape[0]: bitmask_tensor = bitmask_tensor[:cumulative_index] @@ -204,6 +248,15 @@ class StructuredOutputManager: # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() + def should_fill_bitmask(self, request: Request) -> bool: + if self.reasoner is not None: + assert request.structured_output_request is not None + if request.structured_output_request.reasoning_ended is None: + request.structured_output_request.reasoning_ended = \ + self.reasoner.is_reasoning_end(request.prompt_token_ids) + return request.structured_output_request.reasoning_ended + return True + def should_advance(self, request: Request) -> bool: if not request.use_structured_output: return False diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 88544565e5..5e00f63804 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -148,6 +148,7 @@ class XgrammarGrammar(StructuredOutputGrammar): repr=False, hash=False, init=False) + _is_terminated: bool = field(default=False, repr=False, hash=False) def accept_tokens(self, request_id: str, tokens: list[int]) -> bool: """Accepts a list of tokens and advances the FSM. @@ -155,6 +156,8 @@ class XgrammarGrammar(StructuredOutputGrammar): Returns True if the FSM was advanced successfully. Returns False if the FSM failed to advance. """ + if self._is_terminated: + return False for token in tokens: if not self.matcher.accept_token(token): logger.error( @@ -162,6 +165,7 @@ class XgrammarGrammar(StructuredOutputGrammar): "for tokens %s. 
Please file an issue.", request_id, token) return False self.num_processed_tokens += 1 + self._is_terminated = self.matcher.is_terminated() return True def validate_tokens(self, tokens: list[int]) -> list[int]: @@ -184,12 +188,13 @@ class XgrammarGrammar(StructuredOutputGrammar): def rollback(self, num_tokens: int) -> None: self.matcher.rollback(num_tokens) self.num_processed_tokens -= num_tokens + self._is_terminated = self.matcher.is_terminated() def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None: self.matcher.fill_next_token_bitmask(bitmask, idx) def is_terminated(self) -> bool: - return self.matcher.is_terminated() + return self._is_terminated def reset(self): self.num_processed_tokens = 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 85976fc1c8..549f21af79 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1324,9 +1324,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask + # If the grammar bitmask and the logits have the same shape + # we don't need to pass indices to the kernel, + # since the bitmask is already aligned with the logits. + skip_out_indices = grammar_bitmask.shape[0] == logits.shape[0] + # Serialization of np.ndarray is much more efficient than a tensor, # so we receive it in that format. - grammar_bitmask = torch.from_numpy(grammar_bitmask) + grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() # Force use of the torch.compile implementation from xgrammar to work # around issues with the Triton kernel in concurrent structured output @@ -1334,7 +1339,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): xgr_torch_compile.apply_token_bitmask_inplace_torch_compile( logits, grammar_bitmask.to(self.device, non_blocking=True), - indices=out_indices, + indices=out_indices if not skip_out_indices else None, ) def sync_and_slice_intermediate_tensors( From 302962e806e9820643ae25987e8e38ed035e05d3 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:35:32 -0700 Subject: [PATCH 008/932] [Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275) Signed-off-by: Rui Qiao --- vllm/v1/engine/utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index f39aa40593..770aa7d9dc 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -297,10 +297,10 @@ class CoreEngineActorManager: local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local - nodes = sorted(list_nodes(), + nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]), key=lambda node: node.node_ip != dp_master_ip) assert nodes[0].node_ip == dp_master_ip, ( - "The first node must be the head node") + "The head node is missing or dead") assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( "There can only be one head node") @@ -312,6 +312,8 @@ class CoreEngineActorManager: for node in nodes: node_ip = node.node_ip node_resources = available_resources[node.node_id] + if "GPU" not in node_resources: + continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes @@ -346,6 +348,13 @@ class CoreEngineActorManager: ) placement_groups.append(pg) local_dp_ranks.append(i) + if len(placement_groups) < num_pg_to_create: + raise 
ValueError( + f"Not enough resources to allocate {num_pg_to_create} " + "placement groups, only created " + f"{len(placement_groups)} placement groups. " + "Available resources: " + f"{available_resources}") return placement_groups, local_dp_ranks @staticmethod From 5d5d419ca6aa55034eef0144f24e66789b486cb5 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 5 Aug 2025 23:39:32 -0400 Subject: [PATCH 009/932] [Bugfix][CI/Build][ROCm] Make sure to use the headers from the build folder on ROCm (#22264) Signed-off-by: Gregory Shtrasberg --- cmake/utils.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 621179a701..9c0ed1d095 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -467,6 +467,12 @@ function (define_gpu_extension_target GPU_MOD_NAME) if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder + target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc + ${GPU_INCLUDE_DIRECTORIES}) + else() + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) endif() if (GPU_ARCHITECTURES) @@ -482,8 +488,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) target_compile_definitions(${GPU_MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") - target_include_directories(${GPU_MOD_NAME} PRIVATE csrc - ${GPU_INCLUDE_DIRECTORIES}) target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) From e3c876dca357711705822a7539eddca05ee0911f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 21:36:21 -0700 Subject: [PATCH 010/932] Upgrade FA3 for attention sink (#22313) Signed-off-by: Woosuk Kwon --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index ef45a5fbeb..4eb4b464a2 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4 + GIT_TAG b99f8c821771fd11feb66d5c89661e9858fde359 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From dd16bdc7981349edc44900c1c614e09b2faa712e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 21:43:21 -0700 Subject: [PATCH 011/932] Increase openai-python version (#22316) Signed-off-by: Woosuk Kwon --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 6b57a3d2f1..c5eb6dab95 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
aiohttp -openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) +openai >= 1.98.0 # For Responses API with reasoning content pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From 6e20924350e3fed375bc63d55166a303b6f0828a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 22:37:21 -0700 Subject: [PATCH 012/932] Add attention sink in attention backends (#22320) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- .../ops/chunked_prefill_paged_decode.py | 33 ++++++-- vllm/attention/ops/prefix_prefill.py | 18 ++++- .../attention/ops/triton_unified_attention.py | 30 +++++++- vllm/envs.py | 19 ++++- vllm/v1/attention/backends/flash_attn.py | 10 +++ vllm/v1/attention/backends/triton_attn.py | 75 ++++++++++++++----- vllm/v1/attention/backends/utils.py | 36 ++++++--- 7 files changed, 176 insertions(+), 45 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 4f839348e5..08bfcc974c 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -28,6 +28,7 @@ def kernel_paged_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -95,7 +96,17 @@ def kernel_paged_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + M = tl.full([num_queries_per_kv_padded], + float("-inf"), + dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_head_idx, + mask=head_mask, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -223,6 +234,8 @@ def chunked_prefill_paged_decode( alibi_slopes=None, sliding_window=None, sm_scale=None, + # Optional tensor for sinks + sinks=None, ): if sm_scale is None: @@ -253,6 +266,7 @@ def chunked_prefill_paged_decode( sliding_window=sliding_window, sm_scale=sm_scale, skip_decode=True, + sinks=sinks, ) block_size = value_cache.shape[3] @@ -281,11 +295,17 @@ def chunked_prefill_paged_decode( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) - use_custom = use_rocm_custom_paged_attention(query.dtype, head_size, - block_size, - num_queries_per_kv, - max_seq_len, sliding_window, - kv_cache_dtype, alibi_slopes) + use_custom = use_rocm_custom_paged_attention( + query.dtype, + head_size, + block_size, + num_queries_per_kv, + max_seq_len, + sliding_window, + kv_cache_dtype, + alibi_slopes, + sinks, + ) if use_custom: _PARTITION_SIZE_ROCM = 256 max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // @@ -334,6 +354,7 @@ def chunked_prefill_paged_decode( query_ptr=query, key_cache_ptr=key_cache, value_cache_ptr=value_cache, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seq_lens, 
alibi_slopes_ptr=alibi_slopes, diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 13bef96722..64c9033797 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -38,6 +38,7 @@ def _fwd_kernel(Q, V, K_cache, V_cache, + sink_ptr, B_Loc, sm_scale, k_scale, @@ -126,7 +127,15 @@ def _fwd_kernel(Q, other=0.0) # [M,D] # initialize pointer to m and l - m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + m_i = tl.load( + sink_ptr + tl.full([BLOCK_M], cur_head, dtype=tl.int64), + mask=(offs_m < cur_batch_query_len), + other=float("-inf"), + ).to(dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) # [M,D] @@ -732,7 +741,8 @@ def context_attention_fwd(q, alibi_slopes=None, sliding_window=None, sm_scale=None, - skip_decode=False): + skip_decode=False, + sinks=None): q_dtype_is_f32 = q.dtype is torch.float32 @@ -781,6 +791,7 @@ def context_attention_fwd(q, sliding_window = 0 if alibi_slopes is not None: + assert sinks is None, "Sinks arg is not supported with alibi" # need to reduce num. blocks when using fp32 # due to increased use of GPU shared memory # if q.dtype is torch.float32: @@ -843,7 +854,7 @@ def context_attention_fwd(q, max_seq_len = 0 if max_seq_len is None else max_seq_len extra_kargs = {} if current_platform.is_rocm(): - extra_kargs = {"kpack": 2, "waves_per_eu": 2} + extra_kargs = {"kpack": 1, "waves_per_eu": 2} grid = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"])) @@ -853,6 +864,7 @@ def context_attention_fwd(q, v, k_cache, v_cache, + sinks, b_loc, sm_scale, k_scale, diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 0fdba569f9..ba4299a277 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -52,6 +52,7 @@ def kernel_unified_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -131,7 +132,15 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -292,6 +301,7 @@ def kernel_unified_attention_3d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -383,7 +393,15 @@ def kernel_unified_attention_3d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None or segm_idx != 0: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + 
else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -627,6 +645,8 @@ def unified_attention( v_descale, alibi_slopes=None, qq_bias=None, + # Optional tensor for sinks + sinks=None, ): assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" @@ -635,6 +655,10 @@ def unified_attention( assert q.element_size() >= 2 or block_size >= 32, \ "Block size must be at least 32 for fp8" + if sinks is not None: + assert sinks.shape[0] == q.shape[1], \ + "Sinks must be num_query_heads size" + use_alibi_slopes = alibi_slopes is not None use_qq_bias = qq_bias is not None @@ -669,6 +693,7 @@ def unified_attention( query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, @@ -741,6 +766,7 @@ def unified_attention( query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, diff --git a/vllm/envs.py b/vllm/envs.py index e28e9658e5..f8a7197dd1 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = True VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False + VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None @@ -151,6 +152,8 @@ if TYPE_CHECKING: VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False + VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False + VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False def get_default_cache_root(): @@ -326,6 +329,12 @@ environment_variables: dict[str, Callable[[], Any]] = { (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in ("true", "1")), + # Use AITER triton unified attention for V1 attention + "VLLM_USE_AITER_UNIFIED_ATTENTION": + lambda: + (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in + ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid # when using the flash-attention backend. "VLLM_FLASH_ATTN_VERSION": @@ -1022,9 +1031,13 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 1, use the TRTLLM Attention backend in flashinfer. - "VLLM_USE_TRTLLM_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + # If set to 1, use the TRTLLM Context Attention backend in flashinfer. + "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": + lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_CONTEXT_ATTENTION", "0"))), + + # If set to 1, use the TRTLLM Decode Attention backend in flashinfer. + "VLLM_USE_TRTLLM_DECODE_ATTENTION": + lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", "0"))), # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. 
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index f086bab255..95ba56b359 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -373,6 +373,7 @@ class FlashAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -410,6 +411,14 @@ class FlashAttentionImpl(AttentionImpl): raise NotImplementedError( "FlashAttention does not support fp8 kv-cache on this device.") + self.sinks = sinks + if self.sinks is not None: + assert self.vllm_flash_attn_version == 3, ( + "Sinks are only supported in FlashAttention 3") + assert self.sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + "heads in the layer") + def forward( self, layer: torch.nn.Module, @@ -534,6 +543,7 @@ class FlashAttentionImpl(AttentionImpl): k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), num_splits=attn_metadata.max_num_splits, + s_aux=self.sinks, ) return output diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 942cb95eef..c33afbfebc 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with PagedAttention and Triton prefix prefill.""" from dataclasses import dataclass +from functools import cache from typing import ClassVar, Optional import torch @@ -13,7 +14,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.attention.ops.chunked_prefill_paged_decode import ( chunked_prefill_paged_decode) from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform @@ -193,6 +193,15 @@ class TritonAttentionBackend(AttentionBackend): return TritonAttentionMetadataBuilder +@cache +def use_aiter_unified_attention() -> bool: + """Check if aiter unified attention should be used.""" + # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set + # to 1 as default + return envs.VLLM_ROCM_USE_AITER \ + and envs.VLLM_USE_AITER_UNIFIED_ATTENTION + + class TritonAttentionImpl(AttentionImpl): def __init__( @@ -207,6 +216,7 @@ class TritonAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -240,6 +250,29 @@ class TritonAttentionImpl(AttentionImpl): self.force_prefill_decode_attn = \ envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + if not self.force_prefill_decode_attn: + # If not using prefill decode attention, we use the Triton + # unified attention implementation. 
+ if use_aiter_unified_attention(): + logger.info_once( + "Using aiter unified attention for TritonAttentionImpl") + from aiter.ops.triton.unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + else: + logger.info_once( + "Using vllm unified attention for TritonAttentionImpl") + from vllm.attention.ops.triton_unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") + def forward( self, layer: torch.nn.Module, @@ -342,28 +375,31 @@ class TritonAttentionImpl(AttentionImpl): if use_prefill_decode_attn: # Compute attention and update output up to `num_actual_tokens`. - chunked_prefill_paged_decode(query=query[:num_actual_tokens], - key=key[:num_actual_tokens], - value=value[:num_actual_tokens], - output=output[:num_actual_tokens], - kv_cache_dtype=self.kv_cache_dtype, - key_cache=key_cache, - value_cache=value_cache, - block_table=block_table, - query_start_loc=cu_seqlens_q, - seq_lens=seqused_k, - max_seq_len=max_seqlen_k, - max_query_len=max_seqlen_q, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - alibi_slopes=self.alibi_slopes, - sliding_window=self.sliding_window[0], - sm_scale=self.scale) + chunked_prefill_paged_decode( + query=query[:num_actual_tokens], + key=key[:num_actual_tokens], + value=value[:num_actual_tokens], + output=output[:num_actual_tokens], + kv_cache_dtype=self.kv_cache_dtype, + key_cache=key_cache, + value_cache=value_cache, + block_table=block_table, + query_start_loc=cu_seqlens_q, + seq_lens=seqused_k, + max_seq_len=max_seqlen_k, + max_query_len=max_seqlen_q, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + alibi_slopes=self.alibi_slopes, + sliding_window=self.sliding_window[0], + sm_scale=self.scale, + sinks=self.sinks, + ) else: descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - unified_attention( + self.unified_attention( q=query[:num_actual_tokens], k=key_cache, v=value_cache, @@ -381,6 +417,7 @@ class TritonAttentionImpl(AttentionImpl): q_descale=None, # Not supported k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, ) return output diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 7aeea40b25..f521d94331 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -254,7 +254,11 @@ def get_kv_cache_layout(): # Override with format specified by the user. cache_layout = envs.VLLM_KV_CACHE_LAYOUT if cache_layout is None: - cache_layout = get_kv_connector_cache_layout() + if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + cache_layout = "HND" + else: + cache_layout = get_kv_connector_cache_layout() else: logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. Setting KV cache layout to %s.", cache_layout) @@ -272,7 +276,9 @@ def set_kv_cache_layout(cache_layout: str): class PerLayerParameters: """ Currently, FlashInfer backend only support models in which all layers share - the same values for the following hyperparameters. + the same values for the following hyperparameters. Should not be used for + trtllm-gen backend since it supports different values for the following + hyperparameters. 
""" window_left: int @@ -310,7 +316,8 @@ def get_per_layer_parameters( def infer_global_hyperparameters( per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters: """ - Currently, FlashInfer backend only support models in which all layers share + Currently, FlashInfer backend other than trtllm-gen + only support models in which all layers share the same values for the following hyperparameters: - `window_left` - `logits_soft_cap` @@ -324,15 +331,20 @@ def infer_global_hyperparameters( param_sets = list(per_layer_params.values()) global_params = param_sets[0] - for params in param_sets: - if params.window_left != global_params.window_left: - raise ValueError( - "Window left is not the same for all layers. One potential fix " - "is to set disable_sliding_window=True") - assert params == global_params, ( - "FlashInfer backend currently only supports models in which all " - "layers share the same values for the following hyperparameters: " - "`window_left`, `logits_soft_cap`, `sm_scale`.") + + # trtllm attention doesn't need global hyper params so disable the check + if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + for params in param_sets: + if params.window_left != global_params.window_left: + raise ValueError( + "Window left is not the same for all layers. " \ + "One potential fix is to set disable_sliding_window=True") + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all" + "layers share the same values " + "for the following hyperparameters:" + "`window_left`, `logits_soft_cap`, `sm_scale`.") return global_params From 796bae07c59716b7b61d57343826bfbeabdd01bb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 6 Aug 2025 06:56:14 +0100 Subject: [PATCH 013/932] Update transformers to `v4.55` (#21931) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py Co-authored-by: DarkLight1337 Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py Co-authored-by: Woosuk Kwon --- requirements/common.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 6 +- .../multimodal/generation/test_common.py | 4 + tests/models/registry.py | 24 ++- tests/quantization/test_experts_int8.py | 4 + vllm/model_executor/models/interfaces_base.py | 12 +- vllm/model_executor/models/qwen2_vl.py | 11 +- vllm/model_executor/models/transformers.py | 17 +- vllm/model_executor/models/utils.py | 10 +- vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/ovis.py | 176 ++++++++++++++++++ 13 files changed, 235 insertions(+), 39 deletions(-) create mode 100644 vllm/transformers_utils/configs/ovis.py diff --git a/requirements/common.txt b/requirements/common.txt index c5eb6dab95..0a4b27c034 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.53.2 +transformers >= 4.55.0 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. 
diff --git a/requirements/test.in b/requirements/test.in index 9ecaaae927..9c8c75dd6f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.53.2 +transformers==4.55.0 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 691420df87..08ba964f22 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -214,7 +214,7 @@ fiona==1.10.1 # via torchgeo flask==3.1.1 # via mlflow -fonttools==4.54.1 +fonttools==4.55.0 # via matplotlib fqdn==1.5.1 # via jsonschema @@ -286,7 +286,7 @@ httpx==0.27.2 # via # -r requirements/test.in # schemathesis -huggingface-hub==0.33.1 +huggingface-hub==0.34.3 # via # -r requirements/test.in # accelerate @@ -1148,7 +1148,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.53.2 +transformers==4.55.0 # via # -r requirements/test.in # genai-perf diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 8cb826c114..2a65d7e244 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -337,6 +337,10 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we + # should enable this again after the fix is released: + # https://github.com/huggingface/transformers/pull/39915 + marks=[pytest.mark.skip("HF model is broken")], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], diff --git a/tests/models/registry.py b/tests/models/registry.py index 47057d32e9..92a719d7a9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -179,8 +179,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { min_transformers_version="4.54"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), - "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", - min_transformers_version="4.53"), + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), @@ -223,7 +222,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", - extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501 + extras={ + "tiny": "ai21labs/Jamba-tiny-dev", + "random": "ai21labs/Jamba-tiny-random", # noqa: E501 + }), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct", extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501 "hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501 @@ -239,8 +241,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", trust_remote_code=True), - "MiniMaxForCausalLM": 
_HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf", - min_transformers_version="4.53"), + "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"), "MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01", trust_remote_code=True, revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501 @@ -272,6 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", + max_transformers_version="4.53", + transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", trust_remote_code=True), @@ -299,8 +302,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), - "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst", - min_transformers_version="4.53"), + "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), @@ -326,8 +328,12 @@ _EMBEDDING_EXAMPLE_MODELS = { "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True, v0_only=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), - "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), - "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 84a656a3b9..1e3e69e008 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -9,6 +9,8 @@ import pytest from tests.quantization.utils import is_quant_method_supported +from ..models.registry import HF_EXAMPLE_MODELS + MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"] @@ -25,6 +27,8 @@ def test_model_experts_int8_startup( dtype: str, max_tokens: int, ) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4d68227b2a..697fa020de 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project -from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, +from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -14,6 +14,10 @@ if TYPE_CHECKING: from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler from vllm.model_executor.sampling_metadata import SamplingMetadata +else: + VllmConfig = Any + Pooler = Any + SamplingMetadata = Any logger = init_logger(__name__) @@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]): def __init__( self, - vllm_config: "VllmConfig", + vllm_config: VllmConfig, prefix: str = "", ) -> None: ... @@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): def compute_logits( self, hidden_states: T, - sampling_metadata: "SamplingMetadata", + sampling_metadata: SamplingMetadata, ) -> Optional[T]: """Return `None` if TP rank > 0.""" ... @@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. """ - pooler: "Pooler" + pooler: Pooler """The pooler is only called on TP rank 0.""" diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 40d77312b7..633f8598e8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1395,11 +1395,12 @@ class Tarsier2Processor(Qwen2VLProcessor): **kwargs, ): self.image_processor = Tarsier2ImageProcessor(**vision_config) - super().__init__(image_processor=self.image_processor, - tokenizer=tokenizer, - video_processor=Qwen2VLVideoProcessor(), - chat_template=None, - **kwargs) + super().__init__( + image_processor=self.image_processor, + tokenizer=tokenizer, + video_processor=Qwen2VLVideoProcessor(**vision_config), + chat_template=None, + **kwargs) class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo): diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5059d1e1d9..0c3df267ed 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): def replace_linear_class( linear: nn.Linear, style: Literal["colwise", "rowwise"], quant_config: QuantizationConfig -) -> Union[ColumnParallelLinear, RowParallelLinear]: +) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: """ Replace nn.Linear with one of vLLM's tensor parallel linear classes. @@ -445,7 +445,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` - # method after v4.54.0 is released + # method once its checks are fixed in Transformers. self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"), config_override: self.model: PreTrainedModel = AutoModel.from_config( @@ -520,7 +520,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): for i in range(len(layers)): if start_layer <= i and i < end_layer: continue - layers[i] = PPMissingLayer(return_tuple=True) + layers[i] = PPMissingLayer() # Layers after module list for name in pp_plan[module_list_idx + 1:]: @@ -533,14 +533,16 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): Apply the model's tensor parallelization plan. Currently only supports linear layers. 
""" - if not self.model.supports_tp_plan: - if self.tp_size <= 1: - return + tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} + if not tp_plan and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - tp_plan = self.model._tp_plan + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear + tp_plan[".*"] = "replicated" def _tensor_parallel(module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): @@ -552,6 +554,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): child_module, style, self.quant_config) setattr(module, child_name, new_module) log_replacement(qual_name, child_module, new_module) + break else: _tensor_parallel(child_module, prefix=qual_name) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 28508e1bac..fecd14dde4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity): def __init__(self, *args, **kwargs): super().__init__() - self.return_tuple = kwargs.get("return_tuple", False) def forward(self, *args, **kwargs): - """ - Return the first arg from args or the first value from kwargs. - - Wraps the input in a tuple if `self.return_tuple` is True. - """ - input = args[0] if args else next(iter(kwargs.values())) - return (input, ) if self.return_tuple else input + """Return the first arg from args or the first value from kwargs.""" + return args[0] if args else next(iter(kwargs.values())) _CPU_OFFLOAD_BYTES = 0 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8fe153464d..bce24ef74c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,7 +35,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, SpeculatorsConfig, + OvisConfig, RWConfig, + SpeculatorsConfig, Step3TextConfig, Step3VLConfig, UltravoxConfig) # yapf: enable @@ -85,6 +86,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "ovis": OvisConfig, "ultravox": UltravoxConfig, "step3_vl": Step3VLConfig, "step3_text": Step3TextConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 64ace167a5..82d24bb16b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, @@ -45,6 +46,7 @@ __all__ = [ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", + "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", diff --git a/vllm/transformers_utils/configs/ovis.py 
b/vllm/transformers_utils/configs/ovis.py new file mode 100644 index 0000000000..550f5e15db --- /dev/null +++ b/vllm/transformers_utils/configs/ovis.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# yapf: disable +# ruff: noqa: E501 +# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py +# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py +# Ovis Config with AimV2 config registration removed for Transformers compatibility +from typing import Any, Optional, Union + +from transformers import AutoConfig, PretrainedConfig + + +class AIMv2Config(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. 
+ """ + + model_type: str = "aimv2" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + + self.projection_dropout = projection_dropout + self.qkv_bias = qkv_bias + self.use_bias = use_bias + + +# ---------------------------------------------------------------------- +# Visual Tokenizer Configuration +# ---------------------------------------------------------------------- +class BaseVisualTokenizerConfig(PretrainedConfig): + + def __init__(self, + vocab_size=16384, + tokenize_function="softmax", + tau=1.0, + depths=None, + drop_cls_token=False, + backbone_config: Optional[Union[PretrainedConfig, + dict]] = None, + hidden_stride: int = 1, + **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.tokenize_function = tokenize_function + self.tau = tau + if isinstance(depths, str): + depths = [int(x) for x in depths.split('|')] + self.depths = depths + self.backbone_kwargs = dict[str, Any]() + self.drop_cls_token = drop_cls_token + if backbone_config is not None: + assert isinstance(backbone_config, (PretrainedConfig, dict)), \ + f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" + if not isinstance(backbone_config, PretrainedConfig): + model_type = backbone_config['model_type'] + if model_type != "aimv2": + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, **backbone_config) + else: + backbone_config = AIMv2Config(**backbone_config) + self.backbone_config = backbone_config + self.hidden_stride = hidden_stride + + +class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "aimv2_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "siglip_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) +AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) + + +# ---------------------------------------------------------------------- +# Ovis Configuration +# ---------------------------------------------------------------------- +class OvisConfig(PretrainedConfig): + model_type = "ovis" + + def __init__(self, + llm_config: Optional[Union[PretrainedConfig, dict]] = None, + visual_tokenizer_config: Optional[Union[PretrainedConfig, + dict]] = None, + multimodal_max_length=8192, + hidden_size=None, + 
conversation_formatter_class=None, + llm_attn_implementation=None, + disable_tie_weight=False, + **kwargs): + super().__init__(**kwargs) + if llm_config is not None: + assert isinstance(llm_config, (PretrainedConfig, dict)), \ + f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" + if not isinstance(llm_config, PretrainedConfig): + model_type = llm_config['model_type'] + llm_config.pop('model_type') + llm_config = AutoConfig.for_model(model_type, **llm_config) + + # map llm_config to text_config + self.text_config = llm_config + if visual_tokenizer_config is not None: + assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ + f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" + if not isinstance(visual_tokenizer_config, PretrainedConfig): + model_type = visual_tokenizer_config['model_type'] + visual_tokenizer_config.pop('model_type') + visual_tokenizer_config = AutoConfig.for_model( + model_type, **visual_tokenizer_config) + + self.visual_tokenizer_config = visual_tokenizer_config + self.multimodal_max_length = multimodal_max_length + self.hidden_size = hidden_size + self.conversation_formatter_class = conversation_formatter_class + self.llm_attn_implementation = llm_attn_implementation + self.disable_tie_weight = disable_tie_weight From de98252f497b8cde5b9f18a8dac53302f5c72db7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 23:26:00 -0700 Subject: [PATCH 014/932] Add GPT-OSS model code and config [1/N] (#22327) Signed-off-by: Woosuk Kwon --- tests/models/registry.py | 1 + vllm/model_executor/models/config.py | 29 ++ vllm/model_executor/models/gpt_oss.py | 472 +++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 503 insertions(+) create mode 100644 vllm/model_executor/models/gpt_oss.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 92a719d7a9..69961d7385 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -197,6 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", {"1b": "EleutherAI/pythia-1.4b"}), + "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f09be7a59..908d4e628b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -247,6 +247,34 @@ class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig): config.max_model_len) +class GptOssConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + decoding_config = vllm_config.decoding_config + if decoding_config.reasoning_backend == "": + decoding_config.reasoning_backend = "openai" + + # Increase the max capture size from 512 to 1024 for performance. + # NOTE(woosuk): This will increase the number of CUDA graphs + # from 67 to 83. + scheduler_config = vllm_config.scheduler_config + if len(scheduler_config.cuda_graph_sizes) == 1: + max_capture_size = scheduler_config.cuda_graph_sizes[0] + # FIXME(woosuk): When using full cuda graph with FA3, the max + # supported size is 992. 
+ if max_capture_size < 1024: + cuda_graph_sizes = [1, 2, 4] + # Step size 8 for small batch sizes + cuda_graph_sizes += [i for i in range(8, 256, 8)] + # Step size 16 for larger batch sizes + cuda_graph_sizes += [i for i in range(256, 1025, 16)] + scheduler_config.cuda_graph_sizes = cuda_graph_sizes + logger.info( + "Overriding max cuda graph capture size to " + "%d for performance.", 1024) + + class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod @@ -345,4 +373,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, + "GptOssForCausalLM": GptOssConfig, } diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py new file mode 100644 index 0000000000..896560fa24 --- /dev/null +++ b/vllm/model_executor/models/gpt_oss.py @@ -0,0 +1,472 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.distributed as dist +from torch import nn +from transformers import GptOssConfig + +from vllm import envs +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_ep_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.utils import cdiv + +from .utils import extract_layer_index, maybe_prefix + + +class OAIAttention(nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.head_dim = config.head_dim + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.hidden_size = config.hidden_size + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=config.max_position_embeddings, + base=config.rope_theta, + dtype=torch.float32, + rope_scaling={ + "rope_type": + "yarn", + "factor": + config.rope_scaling["factor"], + "original_max_position_embeddings": + config.rope_scaling["original_max_position_embeddings"], + "beta_fast": + config.rope_ntk_beta, + "beta_slow": + config.rope_ntk_alpha, + }, + is_neox_style=True, + ) + + tp_size = get_tensor_model_parallel_world_size() + + attention_sink_dtype = ( + torch.float32 if envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION else torch.bfloat16) + self.sinks = 
torch.nn.Parameter( + torch.empty(config.num_attention_heads // tp_size, + dtype=attention_sink_dtype, + requires_grad=False)) + + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + + self.q_size = self.num_attention_heads * self.head_dim // tp_size + self.kv_size = self.num_key_value_heads * self.head_dim // tp_size + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + + self.qkv = QKVParallelLinear( + hidden_size=self.hidden_size, + head_size=self.head_dim, + total_num_heads=self.num_attention_heads, + total_num_kv_heads=self.num_key_value_heads, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.num_attention_heads * self.head_dim, + output_size=self.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.num_local_attention_heads = config.num_attention_heads // tp_size + self.num_local_key_value_heads = config.num_key_value_heads // tp_size + + # Only apply sliding window to every other layer + sliding_window = (config.sliding_window if self.layer_idx % + 2 == 0 else None) + self.attn = Attention( + self.num_local_attention_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_local_key_value_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=AttentionType.DECODER, + prefix=f"{prefix}.attn", + sinks=self.sinks, + ) + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + t = self.norm(hidden_states) + + qkv, _ = self.qkv(t) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + v = v.contiguous() + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + + return output + hidden_states + + +class MLPBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + layer_idx: int, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = layer_idx + self.num_experts = config.num_local_experts + self.experts_per_token = config.num_experts_per_tok + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + self.router = torch.nn.Linear(config.hidden_size, + config.num_local_experts, + dtype=torch.bfloat16) + assert config.intermediate_size % self.world_size == 0 + self.experts = FusedMoE(num_experts=config.num_local_experts, + top_k=config.num_experts_per_token, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + prefix=f"{prefix}.experts", + apply_router_weight_on_input=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + t = self.norm(x) + g = self.router(t) + t = self.experts(hidden_states=t, router_logits=g) + return x + t + + +class TransformerBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.attn = OAIAttention(config, prefix=f"{prefix}.attn") + self.mlp = MLPBlock(config, + self.layer_idx, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + attn_output = self.attn(hidden_states, positions) + output = self.mlp(attn_output) + return output + + +@support_torch_compile +class 
GptOssModel(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.quant_config = vllm_config.quant_config + self.config.hidden_size = self.config.hidden_size + self.embedding = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + ) + self.layers = torch.nn.ModuleList([ + TransformerBlock( + self.config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, f"block.{layer_idx}"), + ) for layer_idx in range(self.config.num_hidden_layers) + ]) + self.norm = RMSNorm(self.config.hidden_size, eps=1e-5) + + def forward(self, input_ids: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + x = self.embedding(input_ids) + for layer in self.layers: + x = layer(x, positions) + x = self.norm(x) + return x + + +class GptOssForCausalLM(nn.Module): + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config.hf_config + self.model = GptOssModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.lm_head = ParallelLMHead( + self.model_config.vocab_size, + self.model_config.hidden_size, + ) + self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + assert intermediate_tensors is None + assert inputs_embeds is None + return self.model(input_ids, positions) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + mxfp4_block = 32 + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + # FIXME(woosuk): Remove this after 
testing. + weight = weight.cuda() + + if "gate_up_proj_blocks" in name: + # Handle MLP gate and up projection weights + new_name = name.replace("gate_up_proj_blocks", "w13_weight") + + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_blocks" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_blocks", "w2_weight") + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "gate_up_proj_scales" in name: + # Handle MLP gate and up projection weights scale + new_name = name.replace("gate_up_proj_scales", + "w13_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_scales" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_scales", "w2_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9b6ab52d86..c746e8ec3f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -74,6 +74,7 @@ _TEXT_GENERATION_MODELS = { "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), + "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), From 98a3a81024649985ed8814a4b7d083d2303fd73c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 23:30:38 -0700 Subject: [PATCH 015/932] [ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/platforms/rocm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 54ffc83cd5..d26e4b3350 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -127,7 +127,8 @@ def use_rocm_custom_paged_attention( max_seq_len: int, sliding_window: int, kv_cache_dtype: str, - alibi_slopes: Optional[torch.Tensor] = None) -> bool: + alibi_slopes: Optional[torch.Tensor] = None, + sinks: Optional[torch.Tensor] = None) -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) @@ -145,7 +146,7 @@ def use_rocm_custom_paged_attention( and max_seq_len <= 128 * 1024 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - and envs.VLLM_ROCM_USE_AITER)) + and envs.VLLM_ROCM_USE_AITER) and sinks is None) else: return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0 @@ -155,7 +156,7 @@ def use_rocm_custom_paged_attention( and (gqa_ratio >= 3 and gqa_ratio <= 16) and max_seq_len <= 128 * 1024 and alibi_slopes is None and kv_cache_dtype == "auto" - 
and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN and sinks is None) class RocmPlatform(Platform): @@ -170,7 +171,7 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf", - "quark", "ptpc_fp8" + "quark", "ptpc_fp8", "mxfp4" ] @classmethod @@ -469,4 +470,4 @@ class RocmPlatform(Platform): @classmethod def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: - return True \ No newline at end of file + return True From a47e6ffe9366516ea5ca28e27fc87367a869e854 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 5 Aug 2025 23:39:13 -0700 Subject: [PATCH 016/932] [GptOss] Add GptOss reasoning parser to support structure output (#22322) Signed-off-by: Chen Zhang Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Woosuk Kwon Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/model_executor/models/config.py | 6 +-- vllm/reasoning/__init__.py | 2 + vllm/reasoning/gptoss_reasoning_parser.py | 64 +++++++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 vllm/reasoning/gptoss_reasoning_parser.py diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 908d4e628b..6f21cd267b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -247,13 +247,13 @@ class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig): config.max_model_len) -class GptOssConfig(VerifyAndUpdateConfig): +class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: decoding_config = vllm_config.decoding_config if decoding_config.reasoning_backend == "": - decoding_config.reasoning_backend = "openai" + decoding_config.reasoning_backend = "GptOss" # Increase the max capture size from 512 to 1024 for performance. 
# NOTE(woosuk): This will increase the number of CUDA graphs @@ -373,5 +373,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, - "GptOssForCausalLM": GptOssConfig, + "GptOssForCausalLM": GptOssForCausalLMConfig, } diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 1c3f78f2ed..b987adeb64 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -4,6 +4,7 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser +from .gptoss_reasoning_parser import GptOssReasoningParser from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser from .mistral_reasoning_parser import MistralReasoningParser @@ -20,4 +21,5 @@ __all__ = [ "Glm4MoeModelReasoningParser", "MistralReasoningParser", "Step3ReasoningParser", + "GptOssReasoningParser", ] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py new file mode 100644 index 0000000000..05a72ac23b --- /dev/null +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence +from typing import Optional, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("GptOss") +class GptOssReasoningParser(ReasoningParser): + """ + Reasoning parser for GptOss model. + + The GptOss model uses harmony to extract reasoning content and this parser + is only used for detecting the end of the reasoning content. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + self.reasoning_end_token_ids = self.model_tokenizer.encode( + "<|start|>assistant<|channel|>final<|message|>") + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + end_token_ids = self.reasoning_end_token_ids + assert len(end_token_ids) > 0, "reasoning_end_token_ids is empty" + # Check if the end sequence is present in the input_ids. + # We search from the end of input_ids to find the last match. + for i in range(len(input_ids) - len(end_token_ids), -1, -1): + if input_ids[i:i + len(end_token_ids)] == end_token_ids: + return True + return False + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. This " + "function should not be called.") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. 
This " + "function should not be called.") + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. This " + "function should not be called.") From 90ec006937c4bcb33b4c0423285fd72502659cfe Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Tue, 5 Aug 2025 23:48:19 -0700 Subject: [PATCH 017/932] [gpt-oss] flashinfer attention sink init (#22330) Signed-off-by: simon-mo Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Woosuk Kwon Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> --- vllm/v1/attention/backends/flashinfer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8592d1b26d..caf9ecc911 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -611,6 +611,7 @@ class FlashInferImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -635,6 +636,15 @@ class FlashInferImpl(AttentionImpl): "are not implemented for " "FlashInferImpl") + self.sinks: Optional[torch.Tensor] = None + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads " + "as the number of heads in the layer" + ) + assert sinks.dtype == torch.float32, "Sinks must be of type float32" + self.sinks = sinks + def forward( self, layer: torch.nn.Module, From 134a8ee8fdbcbb838a54911fd2b129f2ceda0f17 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 00:10:14 -0700 Subject: [PATCH 018/932] [gpt-oss] Add openai-harmony as default dependency (#22332) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- requirements/common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/common.txt b/requirements/common.txt index 0a4b27c034..5405df359a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -49,3 +49,4 @@ ninja # Required for xgrammar, rocm, tpu, xpu pybase64 # fast base64 implementation cbor2 # Required for cross-language serialization of hashable objects setproctitle # Used to set process names for better debugging and monitoring +openai-harmony >= 0.0.3 # Required for gpt-oss From fa00c5d75bc63c87f5822f839db1342f19e4acc8 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 6 Aug 2025 15:50:25 +0800 Subject: [PATCH 019/932] [Misc] Clean up duplicated hf overrides (#22311) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/multimodal/test_tensor_schema.py | 51 +-------------- tests/models/test_initialization.py | 62 +++---------------- tests/models/utils.py | 61 ++++++++++++++++++ 3 files changed, 71 insertions(+), 103 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index f80e8456f0..a4cb1a6883 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ 
b/tests/models/multimodal/test_tensor_schema.py @@ -1,11 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial -from typing import Any from unittest.mock import patch import pytest -from transformers import PretrainedConfig from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine @@ -19,6 +17,7 @@ from vllm.v1.engine.core import EngineCore as V1EngineCore from ...conftest import VllmRunner from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ..utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -51,51 +50,6 @@ def create_batched_mm_kwargs( return mm_kwargs -# Avoid OOM and reduce initialization time by only using 1 layer -def hf_overrides(hf_config: PretrainedConfig, - exist_overrides: dict[str, Any]) -> PretrainedConfig: - hf_config.update(exist_overrides) - text_config = hf_config.get_text_config() - # Ensure at least 2 expert per group - # Since `grouped_topk` assumes top-2 - n_group = getattr(text_config, 'n_group', None) - num_experts = n_group * 2 if n_group is not None else 2 - # we use three layers for Gemma-3n to check - # both normal layer and kv_shared_layer - text_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "num_experts": num_experts, - "num_experts_per_tok": 2, - "num_local_experts": num_experts, - # Otherwise there will not be any expert layers - "first_k_dense_replace": 0, - # To avoid OOM on DeepSeek-V3 - "n_routed_experts": num_experts, - # For Gemma-3n - "num_kv_shared_layers": 1, - }) - if hasattr(hf_config, "vision_config"): - hf_config.vision_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - # e.g.: ibm-granite/granite-speech-3.3-2b - if hasattr(hf_config, "encoder_config"): - hf_config.encoder_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - # e.g.: Qwen/Qwen2-Audio-7B-Instruct - if hasattr(hf_config, "audio_config"): - hf_config.audio_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "encoder_layers": 1, - }) - return hf_config - - @pytest.mark.core_model @pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], @@ -110,7 +64,8 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], model_id = model_info.default - hf_overrides_fn = partial(hf_overrides, + hf_overrides_fn = partial(dummy_hf_overrides, + model_arch=model_arch, exist_overrides=model_info.hf_overrides) model_config = ModelConfig( diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 4c7da24fca..f0aa91566b 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from functools import partial from unittest.mock import patch import pytest -from transformers import PretrainedConfig from vllm import LLM from vllm.config import ModelImpl @@ -16,6 +16,7 @@ from vllm.v1.engine.core import EngineCore as V1EngineCore from ..utils import create_new_process_for_each_test from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels) +from .utils import dummy_hf_overrides @create_new_process_for_each_test() @@ -33,64 +34,15 @@ def can_initialize(model_arch: str, monkeypatch: 
pytest.MonkeyPatch, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + hf_overrides_fn = partial(dummy_hf_overrides, + model_arch=model_arch, + exist_overrides=model_info.hf_overrides) + if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): from vllm.model_executor.models.llama4 import Llama4ForCausalLM from vllm.model_executor.models.registry import ModelRegistry ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid OOM and reduce initialization time by only using 1 layer - def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: - hf_config.update(model_info.hf_overrides) - - text_config = hf_config.get_text_config() - - # Ensure at least 2 expert per group - # Since `grouped_topk` assumes top-2 - n_group = getattr(text_config, 'n_group', None) - num_experts = n_group * 2 if n_group is not None else 2 - - # we use three layers for Gemma-3n to check - # both normal layer and kv_shared_layer - num_hidden_layers = (3 if model_arch - == "Gemma3nForConditionalGeneration" else 1) - - text_config.update({ - "num_layers": 1, - "num_hidden_layers": num_hidden_layers, - "num_experts": num_experts, - "num_experts_per_tok": 2, - "num_local_experts": num_experts, - # Otherwise there will not be any expert layers - "first_k_dense_replace": 0, - # To avoid OOM on DeepSeek-V3 - "n_routed_experts": num_experts, - # For Gemma-3n - "num_kv_shared_layers": 1, - }) - - if hasattr(hf_config, "vision_config"): - hf_config.vision_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - - # e.g.: ibm-granite/granite-speech-3.3-2b - if hasattr(hf_config, "encoder_config"): - hf_config.encoder_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - - # e.g.: Qwen/Qwen2-Audio-7B-Instruct - if hasattr(hf_config, "audio_config"): - hf_config.audio_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "encoder_layers": 1, - }) - - return hf_config - # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 @@ -132,7 +84,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, load_format="dummy", model_impl=ModelImpl.TRANSFORMERS if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, - hf_overrides=hf_overrides, + hf_overrides=hf_overrides_fn, ) diff --git a/tests/models/utils.py b/tests/models/utils.py index bda7ea3e3a..1513db5220 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -7,6 +7,7 @@ from typing import Any, NamedTuple, Optional, Union import torch import torch.nn.functional as F +from transformers import PretrainedConfig from vllm.config import ModelConfig, RunnerOption from vllm.inputs import InputContext @@ -351,3 +352,63 @@ class RerankModelInfo(NamedTuple): architecture: str = "" dtype: str = "auto" enable_test: bool = True + + +def dummy_hf_overrides( + hf_config: PretrainedConfig, + model_arch: str, + exist_overrides: Optional[dict[str, Any]] = None, +) -> PretrainedConfig: + """ + Dummy HF overrides function used to create dummy model + with only minimum nums of layer. 
+ """ + hf_config.update(exist_overrides or {}) + + text_config = hf_config.get_text_config() + + # Ensure at least 2 expert per group + # Since `grouped_topk` assumes top-2 + n_group = getattr(text_config, 'n_group', None) + num_experts = n_group * 2 if n_group is not None else 2 + + # we use three layers for Gemma-3n to check + # both normal layer and kv_shared_layer + num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration" + else 1) + text_config.update({ + "num_layers": 1, + "num_hidden_layers": num_hidden_layers, + "num_experts": num_experts, + "num_experts_per_tok": 2, + "num_local_experts": num_experts, + # Otherwise there will not be any expert layers + "first_k_dense_replace": 0, + # To avoid OOM on DeepSeek-V3 + "n_routed_experts": num_experts, + # For Gemma-3n + "num_kv_shared_layers": 1, + }) + + if hasattr(hf_config, "vision_config"): + hf_config.vision_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + + # e.g.: ibm-granite/granite-speech-3.3-2b + if hasattr(hf_config, "encoder_config"): + hf_config.encoder_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + + # e.g.: Qwen/Qwen2-Audio-7B-Instruct + if hasattr(hf_config, "audio_config"): + hf_config.audio_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "encoder_layers": 1, + }) + + return hf_config From 178d03fbd64e18999647b349623cd1489f816c8c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 01:08:49 -0700 Subject: [PATCH 020/932] [gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22340) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/entrypoints/context.py | 177 ++++++++++++++++++++++++++++++ vllm/entrypoints/harmony_utils.py | 111 +++++++++++++++++++ vllm/entrypoints/tool.py | 87 +++++++++++++++ 3 files changed, 375 insertions(+) create mode 100644 vllm/entrypoints/context.py create mode 100644 vllm/entrypoints/harmony_utils.py create mode 100644 vllm/entrypoints/tool.py diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py new file mode 100644 index 0000000000..6292306e7c --- /dev/null +++ b/vllm/entrypoints/context.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from abc import ABC, abstractmethod + +from openai_harmony import Message, Role, StreamState + +from vllm.entrypoints.harmony_utils import ( + get_encoding, get_streamable_parser_for_assistant, render_for_completion) +from vllm.entrypoints.tool import Tool +from vllm.outputs import RequestOutput + +logger = logging.getLogger(__name__) + + +class ConversationContext(ABC): + + @abstractmethod + def append_output(self, output) -> None: + pass + + @abstractmethod + async def call_tool(self) -> list[Message]: + pass + + @abstractmethod + def need_builtin_tool_call(self) -> bool: + pass + + @abstractmethod + def render_for_completion(self) -> list[int]: + pass + + +class SimpleContext(ConversationContext): + + def __init__(self): + self.last_output = None + + def append_output(self, output) -> None: + self.last_output = output + + def need_builtin_tool_call(self) -> bool: + return False + + async def call_tool(self) -> list[Message]: + raise NotImplementedError("Should not be called.") + + def 
render_for_completion(self) -> list[int]: + raise NotImplementedError("Should not be called.") + + +class HarmonyContext(ConversationContext): + + def __init__( + self, + messages: list, + tool_sessions: dict[str, Tool], + ): + self._messages = messages + self.tool_sessions = tool_sessions + + self.parser = get_streamable_parser_for_assistant() + self.num_init_messages = len(messages) + # TODO(woosuk): Implement the following fields. + self.num_prompt_tokens = 0 + self.num_cached_tokens = 0 + self.num_output_tokens = 0 + self.num_reasoning_tokens = 0 + + def append_output(self, output) -> None: + if isinstance(output, RequestOutput): + output_token_ids = output.outputs[0].token_ids + for token_id in output_token_ids: + self.parser.process(token_id) + output_msgs = self.parser.messages + else: + # Tool output. + output_msgs = output + self._messages.extend(output_msgs) + + @property + def messages(self) -> list: + return self._messages + + def need_builtin_tool_call(self) -> bool: + last_msg = self.messages[-1] + recipient = last_msg.recipient + return recipient is not None and (recipient.startswith("browser.") + or recipient.startswith("python")) + + async def call_tool(self) -> list[Message]: + if not self.messages: + return [] + last_msg = self.messages[-1] + recipient = last_msg.recipient + if recipient is not None: + if recipient.startswith("browser."): + return await self.call_search_tool( + self.tool_sessions["browser"], last_msg) + elif recipient.startswith("python"): + return await self.call_python_tool( + self.tool_sessions["python"], last_msg) + raise ValueError("No tool call found") + + def render_for_completion(self) -> list[int]: + return render_for_completion(self.messages) + + async def call_search_tool( + self, + tool_session: Tool, + last_msg: Message, + ) -> list[Message]: + return await tool_session.get_result(self) + + async def call_python_tool( + self, + tool_session: Tool, + last_msg: Message, + ) -> list[Message]: + return await tool_session.get_result(self) + + +class StreamingHarmonyContext(HarmonyContext): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.last_output = None + + self.parser = get_streamable_parser_for_assistant() + self.encoding = get_encoding() + self.last_tok = None + + @property + def messages(self) -> list: + return self.parser.messages + + def append_output(self, output) -> None: + if isinstance(output, RequestOutput): + tok = output.outputs[0].token_ids[0] + self.parser.process(tok) + self.last_tok = tok + else: + # Handle the case of tool output in direct message format + assert len(output) == 1, "Tool output should be a single message" + msg = output[0] + # Sometimes the recipient is not set for tool messages, + # so we set it to "assistant" + if msg.author.role == Role.TOOL and msg.recipient is None: + msg.recipient = "assistant" + toks = self.encoding.render(msg) + for tok in toks: + self.parser.process(tok) + self.last_tok = toks[-1] + + def is_expecting_start(self) -> bool: + return self.parser.state == StreamState.EXPECT_START + + def is_assistant_action_turn(self) -> bool: + return self.last_tok in self.encoding.stop_tokens_for_assistant_actions( + ) + + def render_for_completion(self) -> list[int]: + # now this list of tokens as next turn's starting tokens + # `<|start|>assistant``, + # we need to process them in parser. 
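# (Editor's note) Put differently: super().render_for_completion() re-renders the whole
# conversation and appends the header tokens for the assistant's next turn, which the
# streaming parser has not seen yet. The loop below walks backwards from the end of the
# rendered prompt until it meets self.last_tok, then replays the collected tokens in
# forward order so the parser state matches what is actually sent to the engine.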
+ rendered_tokens = super().render_for_completion() + + last_n = -1 + to_process = [] + while rendered_tokens[last_n] != self.last_tok: + to_process.append(rendered_tokens[last_n]) + last_n -= 1 + for tok in reversed(to_process): + self.parser.process(tok) + + return rendered_tokens diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py new file mode 100644 index 0000000000..801c82b4fa --- /dev/null +++ b/vllm/entrypoints/harmony_utils.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime +from typing import Literal, Optional + +from openai.types.responses.tool import Tool +from openai_harmony import (Conversation, DeveloperContent, + HarmonyEncodingName, Message, ReasoningEffort, + Role, StreamableParser, SystemContent, TextContent, + ToolDescription, load_harmony_encoding) + +REASONING_EFFORT = { + "high": ReasoningEffort.HIGH, + "medium": ReasoningEffort.MEDIUM, + "low": ReasoningEffort.LOW, +} + +_harmony_encoding = None + + +def get_encoding(): + global _harmony_encoding + if _harmony_encoding is None: + _harmony_encoding = load_harmony_encoding( + HarmonyEncodingName.HARMONY_GPT_OSS) + return _harmony_encoding + + +def get_system_message( + model_identity: Optional[str] = None, + reasoning_effort: Optional[Literal["high", "medium", "low"]] = None, + start_date: Optional[str] = None, + browser_description: Optional[str] = None, + python_description: Optional[str] = None, +) -> Message: + sys_msg_content = SystemContent.new() + if model_identity is not None: + sys_msg_content = sys_msg_content.with_model_identity(model_identity) + if reasoning_effort is not None: + sys_msg_content = sys_msg_content.with_reasoning_effort( + REASONING_EFFORT[reasoning_effort]) + if start_date is None: + # NOTE(woosuk): This brings non-determinism in vLLM. Be careful. + start_date = datetime.datetime.now().strftime("%Y-%m-%d") + sys_msg_content = sys_msg_content.with_conversation_start_date(start_date) + if browser_description is not None: + sys_msg_content = sys_msg_content.with_tools(browser_description) + if python_description is not None: + sys_msg_content = sys_msg_content.with_tools(python_description) + sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content) + return sys_msg + + +def get_developer_message(instructions: Optional[str] = None, + tools: Optional[list[Tool]] = None) -> Message: + dev_msg_content = DeveloperContent.new() + if instructions is not None: + dev_msg_content = dev_msg_content.with_instructions(instructions) + if tools is not None: + function_tools = [] + for tool in tools: + if tool.type in ("web_search_preview", "code_interpreter"): + # These are built-in tools that are added to the system message. 
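# (Editor's note) Concretely: "web_search_preview" / "code_interpreter" requests are
# expected to be expressed through get_system_message(browser_description=...,
# python_description=...) rather than as developer-level function tools, which is why
# they are skipped here.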
+ pass + elif tool.type == "function": + function_tools.append(tool) + else: + raise ValueError(f"tool type {tool.type} not supported") + if function_tools: + function_tool_descriptions = [ + ToolDescription.new( + name=tool.name, + description=tool.description, + parameters=tool.parameters, + ) for tool in function_tools + ] + dev_msg_content = dev_msg_content.with_function_tools( + function_tool_descriptions) + dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content) + return dev_msg + + +def get_user_message(content: str) -> Message: + return Message.from_role_and_content(Role.USER, content) + + +def parse_chat_input(chat_msg) -> Message: + role = chat_msg["role"] + content = chat_msg["content"] + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. + contents = [TextContent(text=c["text"]) for c in content] + msg = Message.from_role_and_contents(role, contents) + return msg + + +def render_for_completion(messages: list[Message]) -> list[int]: + conversation = Conversation.from_messages(messages) + token_ids = get_encoding().render_conversation_for_completion( + conversation, Role.ASSISTANT) + return token_ids + + +def get_stop_tokens_for_assistant_actions() -> list[int]: + return get_encoding().stop_tokens_for_assistant_actions() + + +def get_streamable_parser_for_assistant() -> StreamableParser: + return StreamableParser(get_encoding(), role=Role.ASSISTANT) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py new file mode 100644 index 0000000000..01ee77414f --- /dev/null +++ b/vllm/entrypoints/tool.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from vllm.logger import init_logger + +if TYPE_CHECKING: + # Avoid circular import. 
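# (Editor's note) context.py imports Tool at module level, while the tools below only
# need ConversationContext for type hints (and import HarmonyContext lazily inside
# get_result), so the import is confined to TYPE_CHECKING to break the cycle.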
+ from vllm.entrypoints.context import ConversationContext + +logger = init_logger(__name__) + + +class Tool(ABC): + + @abstractmethod + async def get_result(self, context: "ConversationContext") -> Any: + pass + + +class HarmonyBrowserTool(Tool): + + def __init__(self): + self.enabled = True + exa_api_key = os.getenv("EXA_API_KEY") + if not exa_api_key: + self.enabled = False + logger.warning_once("EXA_API_KEY is not set, browsing is disabled") + return + + try: + from gpt_oss.tools.simple_browser import SimpleBrowserTool + from gpt_oss.tools.simple_browser.backend import ExaBackend + except ImportError: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed, browsing is disabled") + return + + browser_backend = ExaBackend(source="web", api_key=exa_api_key) + self.browser_tool = SimpleBrowserTool(backend=browser_backend) + logger.info_once("Browser tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.browser_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.browser_tool.tool_config + + +class HarmonyPythonTool(Tool): + + def __init__(self): + self.enabled = True + + try: + from gpt_oss.tools.python_docker.docker_tool import PythonTool + except ImportError: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed, code interpreter is disabled") + return + + self.python_tool = PythonTool() + logger.info_once("Code interpreter tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.python_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.python_tool.tool_config From 54991c548a87392c0c1375e902db1f2ad71c105a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 6 Aug 2025 01:49:44 -0700 Subject: [PATCH 021/932] [gpt-oss] add model to supported models doc (#22336) Signed-off-by: Roger Wang --- docs/models/supported_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 017a339ffc..120fd3f485 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -356,6 +356,7 @@ th { | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | | `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | +| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | | ✅︎ | | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | From f263a4b53fb4070460f3d82538600cf667516d06 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 01:57:39 -0700 Subject: [PATCH 022/932] [gpt-oss] Support chat completion api (#22342) --- vllm/entrypoints/harmony_utils.py | 34 +++++ vllm/entrypoints/openai/protocol.py | 4 + vllm/entrypoints/openai/serving_chat.py | 169 ++++++++++++++++++++---- 3 files changed, 183 insertions(+), 24 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 801c82b4fa..c1b0a084f3 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +from collections.abc import Iterable from typing import Literal, Optional from openai.types.responses.tool import Tool @@ -109,3 +110,36 @@ def get_stop_tokens_for_assistant_actions() -> list[int]: def get_streamable_parser_for_assistant() -> StreamableParser: return StreamableParser(get_encoding(), role=Role.ASSISTANT) + + +def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: + parser = get_streamable_parser_for_assistant() + for token_id in token_ids: + parser.process(token_id) + return parser + + +def parse_chat_output( + token_ids: list[int]) -> tuple[Optional[str], Optional[str], bool]: + parser = parse_output_into_messages(token_ids) + output_msgs = parser.messages + if len(output_msgs) == 0: + # The generation has stopped during reasoning. + is_tool_call = False + reasoning_content = parser.current_content + final_content = None + elif len(output_msgs) == 1: + # The generation has stopped during final message. 
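# (Editor's note) i.e. exactly one complete message (the analysis/reasoning message)
# has been closed by the parser, and the still in-progress final-channel text is
# recovered from parser.current_content below.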
+ is_tool_call = False + reasoning_content = output_msgs[0].content[0].text + final_content = parser.current_content + else: + if len(output_msgs) != 2: + raise ValueError( + "Expected 2 output messages (reasoning and final), " + f"but got {len(output_msgs)}.") + reasoning_msg, final_msg = output_msgs + reasoning_content = reasoning_msg.content[0].text + final_content = final_msg.content[0].text + is_tool_call = final_msg.recipient is not None + return reasoning_content, final_content, is_tool_call diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 64f2beb140..57aa427207 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -323,6 +323,7 @@ class ResponsesRequest(OpenAIBaseModel): if (top_p := self.top_p) is None: top_p = default_sampling_params.get( "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]) + stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output guided_decoding = None @@ -340,6 +341,7 @@ class ResponsesRequest(OpenAIBaseModel): top_p=top_p, max_tokens=max_tokens, logprobs=self.top_logprobs, + stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), guided_decoding=guided_decoding, @@ -404,6 +406,8 @@ class ChatCompletionRequest(OpenAIBaseModel): Literal["required"], ChatCompletionNamedToolChoiceParam, ]] = "none" + reasoning_effort: Optional[Literal["low", "medium", "high"]] = None + include_reasoning: bool = True # NOTE this will be ignored by vLLM -- the model determines the behavior parallel_tool_calls: Optional[bool] = False diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index e1d8a31672..6ad0a8ec54 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,6 +12,7 @@ import jinja2 import partial_json_parser import regex as re from fastapi import Request +from openai_harmony import Message as OpenAIMessage from pydantic import TypeAdapter from vllm.config import ModelConfig @@ -19,6 +20,10 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, random_tool_call_id) +from vllm.entrypoints.harmony_utils import ( + get_developer_message, get_stop_tokens_for_assistant_actions, + get_streamable_parser_for_assistant, get_system_message, parse_chat_input, + parse_chat_output, render_for_completion) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -35,6 +40,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall) from vllm.entrypoints.utils import get_max_tokens +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -125,6 +131,23 @@ class OpenAIServingChat(OpenAIServing): logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + self.use_harmony = model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + 
get_stop_tokens_for_assistant_actions()) + + # NOTE(woosuk): While OpenAI's chat completion API supports browsing + # for some models, currently vLLM doesn't support it. Please use the + # Responses API instead. + self.supports_browsing = False + self.browser_tool = None + # NOTE(woosuk): Chat completion API does not support code interpreter. + # Please use the Responses API instead. + self.supports_code_interpreter = False + self.python_tool = None + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -169,7 +192,8 @@ class OpenAIServingChat(OpenAIServing): if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) - and not isinstance(tokenizer, MistralTokenizer)): + and not isinstance(tokenizer, MistralTokenizer) + and not self.use_harmony): # for hf tokenizers, "auto" tools requires # --enable-auto-tool-choice and --tool-call-parser return self.create_error_response( @@ -184,25 +208,35 @@ class OpenAIServingChat(OpenAIServing): else: tool_dicts = [tool.model_dump() for tool in request.tools] - ( - conversation, - request_prompts, - engine_prompts, - ) = await self._preprocess_chat( - request, - tokenizer, - request.messages, - chat_template=request.chat_template or self.chat_template, - chat_template_content_format=self.chat_template_content_format, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tool_dicts=tool_dicts, - documents=request.documents, - chat_template_kwargs=request.chat_template_kwargs, - tool_parser=tool_parser, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) + if not self.use_harmony: + # Common case. + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + # For GPT-OSS. + ( + conversation, + request_prompts, + engine_prompts, + ) = self._make_request_with_harmony(request) except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -436,6 +470,11 @@ class OpenAIServingChat(OpenAIServing): finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 num_cached_tokens = None + if self.use_harmony: + harmony_parsers = [ + get_streamable_parser_for_assistant() + for _ in range(num_choices) + ] if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -597,7 +636,18 @@ class OpenAIServingChat(OpenAIServing): else: logprobs = None - delta_text = output.text + if self.use_harmony: + harmony_parser = harmony_parsers[i] + for token_id in output.token_ids: + harmony_parser.process(token_id) + # FIXME(woosuk): Support function calling + is_final = harmony_parser.current_channel == "final" + if not (request.include_reasoning or is_final): + # Skip the reasoning content. 
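# (Editor's note) i.e. when the client sets include_reasoning=False, deltas from the
# analysis channel are dropped here; final-channel deltas are always streamed.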
+ continue + delta_text = harmony_parser.last_content_delta or "" + else: + delta_text = output.text if not delta_text and not output.token_ids and \ not previous_num_tokens[i]: @@ -607,7 +657,8 @@ class OpenAIServingChat(OpenAIServing): delta_message: Optional[DeltaMessage] # just update previous_texts and previous_token_ids - if tool_choice_auto or self.reasoning_parser: + if ((tool_choice_auto or self.reasoning_parser) + and not self.use_harmony): assert previous_texts is not None assert all_previous_token_ids is not None previous_text = previous_texts[i] @@ -621,8 +672,14 @@ class OpenAIServingChat(OpenAIServing): else: current_token_ids = list(output.token_ids) + if self.use_harmony: + if is_final: + delta_message = DeltaMessage(content=delta_text) + else: + delta_message = DeltaMessage( + reasoning_content=delta_text) # handle streaming deltas for tools with named tool_choice - if tool_choice_function_name: + elif tool_choice_function_name: if (self.reasoning_parser and not reasoning_end_arr[i] and not reasoning_parser.is_reasoning_end( previous_token_ids)): @@ -990,7 +1047,38 @@ class OpenAIServingChat(OpenAIServing): ) else: logprobs = None - auto_tools_called = False + + if self.use_harmony: + reasoning_content, final_content, is_tool_call = ( + parse_chat_output(token_ids)) + if not request.include_reasoning: + reasoning_content = None + + if is_tool_call: + # TODO(woosuk): Implement tool call for gpt-oss. + # For now, only Responses API supports tool call for + # gpt-oss. + raise NotImplementedError( + "Tool call in Chat Completion API is not supported " + "for gpt-oss yet. Please use Responses API instead.") + else: + # Normal message + message = ChatMessage( + role=role, + reasoning_content=reasoning_content, + content=final_content, + ) + + choice_data = ChatCompletionResponseChoice( + index=output.index, + message=message, + logprobs=logprobs, + finish_reason="tool_calls" if is_tool_call else + output.finish_reason if output.finish_reason else "stop", + stop_reason=output.stop_reason, + ) + choices.append(choice_data) + continue if self.reasoning_parser: try: @@ -1003,10 +1091,13 @@ class OpenAIServingChat(OpenAIServing): reasoning_content, content = ( reasoning_parser.extract_reasoning_content( output.text, request=request)) + if not request.include_reasoning: + reasoning_content = None else: reasoning_content = None content = output.text + auto_tools_called = False # if auto tools are not enabled, and a named tool choice using # outlines is not being used if (not self.enable_auto_tools or not self.tool_parser) and \ @@ -1261,3 +1352,33 @@ class OpenAIServingChat(OpenAIServing): and delta_message.tool_calls[0].function and delta_message.tool_calls[0].function.arguments is not None ) + + def _make_request_with_harmony( + self, + request: ChatCompletionRequest, + ): + messages: list[OpenAIMessage] = [] + + # Add system message. + # NOTE: In Chat Completion API, browsing is enabled by default + # if the model supports it. TODO: Support browsing. + assert not self.supports_browsing + assert not self.supports_code_interpreter + sys_msg = get_system_message( + reasoning_effort=request.reasoning_effort, + browser_description=None, + python_description=None) + messages.append(sys_msg) + + # Add developer message. + dev_msg = get_developer_message() + messages.append(dev_msg) + + # Add user message. + for chat_msg in request.messages: + messages.append(parse_chat_input(chat_msg)) + + # Render prompt token ids. 
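# (Editor's note) render_for_completion() (harmony_utils) wraps the Message list in a
# Conversation and tokenizes it up to the assistant's next turn, so the engine receives
# token ids directly (EngineTokensPrompt below) instead of a chat-templated string.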
+ prompt_token_ids = render_for_completion(messages) + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + return messages, [prompt_token_ids], [engine_prompt] From 9edd1db02bc6dce6da503503a373657f3466a78b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 02:22:03 -0700 Subject: [PATCH 023/932] [Minor] Fix type (#22347) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/harmony_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index c1b0a084f3..ecda35c980 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from typing import Literal, Optional from openai.types.responses.tool import Tool @@ -120,7 +120,7 @@ def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: def parse_chat_output( - token_ids: list[int]) -> tuple[Optional[str], Optional[str], bool]: + token_ids: Sequence[int]) -> tuple[Optional[str], Optional[str], bool]: parser = parse_output_into_messages(token_ids) output_msgs = parser.messages if len(output_msgs) == 0: From 2cb6ef8996320273705933d5b24fc6674eb95de8 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 11:03:03 -0400 Subject: [PATCH 024/932] [BugFix] Fix FA2 RuntimeError when sinks is provided (#22365) Signed-off-by: LucasWilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4eb4b464a2..59b99e9e20 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG b99f8c821771fd11feb66d5c89661e9858fde359 + GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From b4b9813b5e2076d510ae518252f64064e6646a3e Mon Sep 17 00:00:00 2001 From: Zhang Jason Date: Wed, 6 Aug 2025 23:58:38 +0800 Subject: [PATCH 025/932] add the codes to check AMD Instinct GPU number (#22367) Signed-off-by: Zhang Jason --- .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index 1178681f15..a409c49b5d 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -21,8 +21,14 @@ check_hf_token() { } check_num_gpus() { - # can you check if the number of GPUs are >=2 via nvidia-smi? - num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + # can you check if the number of GPUs are >=2 via nvidia-smi/rocm-smi? + which rocm-smi > /dev/null 2>&1 + if [ $? 
-ne 0 ]; then + num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + else + num_gpus=$(rocm-smi --showid | grep Instinct | wc -l) + fi + if [ "$num_gpus" -lt 2 ]; then echo "You need at least 2 GPUs to run disaggregated prefill." exit 1 From 4a6b72c2ab9848af31d51d3105a1992b7d5a01dc Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 12:47:38 -0400 Subject: [PATCH 026/932] [BugFix] Fix triton compile error in `kernel_unified_attention_2/3d` caused by attention sinks (#22368) Signed-off-by: LucasWilkinson --- .../attention/ops/triton_unified_attention.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index ba4299a277..56ebed0f52 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -75,6 +75,7 @@ def kernel_unified_attention_2d( USE_ALIBI_SLOPES: tl.constexpr, # bool USE_QQ_BIAS: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int @@ -132,7 +133,7 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None: + if not USE_SINKS: M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: M = tl.load( @@ -322,6 +323,7 @@ def kernel_unified_attention_3d( USE_ALIBI_SLOPES: tl.constexpr, # bool USE_QQ_BIAS: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int @@ -393,14 +395,17 @@ def kernel_unified_attention_3d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None or segm_idx != 0: - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if USE_SINKS: + if segm_idx == 0: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: - M = tl.load( - sink_ptr + query_offset_1, - mask=query_mask_1, - other=float("-inf"), - ).to(dtype=tl.float32) + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -716,6 +721,7 @@ def unified_attention( USE_ALIBI_SLOPES=use_alibi_slopes, USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), @@ -787,6 +793,7 @@ def unified_attention( USE_ALIBI_SLOPES=use_alibi_slopes, USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), From 2435ea7ed5c3a7d058cc6f6d649316e96976acaa Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 6 Aug 2025 13:00:58 -0400 Subject: [PATCH 027/932] [Bugfix] Make condition in triton kernel constexpr (#22370) Signed-off-by: Gregory Shtrasberg --- vllm/attention/ops/chunked_prefill_paged_decode.py | 4 +++- vllm/attention/ops/prefix_prefill.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 
08bfcc974c..dc10d7eca9 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -60,6 +60,7 @@ def kernel_paged_attention_2d( stride_v_cache_3: tl.int64, # int filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, # [num_seqs+1] + USE_SINKS: tl.constexpr, # bool ): seq_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -96,7 +97,7 @@ def kernel_paged_attention_2d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None: + if not USE_SINKS: M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) @@ -386,4 +387,5 @@ def chunked_prefill_paged_decode( stride_v_cache_3=value_cache.stride(3), filter_by_query_len=True, query_start_len_ptr=query_start_loc, + USE_SINKS=sinks is not None, ) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 64c9033797..e1d41930f6 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -81,6 +81,7 @@ def _fwd_kernel(Q, num_unroll_cache: tl.constexpr, num_unroll_request: tl.constexpr, SKIP_DECODE: tl.constexpr, + USE_SINKS: tl.constexpr, MAX_Q_LEN: tl.constexpr = 0, MAX_CTX_LEN: tl.constexpr = 0): @@ -127,7 +128,7 @@ def _fwd_kernel(Q, other=0.0) # [M,D] # initialize pointer to m and l - if sink_ptr is None: + if not USE_SINKS: m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: m_i = tl.load( @@ -910,5 +911,6 @@ def context_attention_fwd(q, num_unroll_request=1, num_warps=4, num_stages=1, + USE_SINKS=sinks is not None, **extra_kargs) return From ec7cb1922478015b4e7eae73c6acde8b598a05a8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 10:32:21 -0700 Subject: [PATCH 028/932] [gpt-oss] Add loop for built-in tool call (#22374) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/entrypoints/openai/serving_engine.py | 56 ++++++++++++++++++++ vllm/entrypoints/openai/serving_responses.py | 33 ++++++------ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 71976fea1e..822f186840 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -35,6 +35,7 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_mistral_chat_template, parse_chat_messages_futures, resolve_chat_template_content_format) +from vllm.entrypoints.context import ConversationContext from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, @@ -948,6 +949,61 @@ class OpenAIServing: return conversation, [request_prompt], [engine_prompt] + async def _generate_with_builtin_tools( + self, + request_id: str, + request_prompt: RequestPrompt, + engine_prompt: EngineTokensPrompt, + sampling_params: SamplingParams, + context: ConversationContext, + lora_request: Optional[LoRARequest] = None, + priority: int = 0, + **kwargs, + ): + orig_priority = priority + while True: + self._log_inputs( + request_id, + request_prompt, + params=sampling_params, + lora_request=lora_request, + ) + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + 
priority=priority, + **kwargs, + ) + async for res in generator: + context.append_output(res) + # NOTE(woosuk): The stop condition is handled by the engine. + yield context + + if not context.need_builtin_tool_call(): + # The model did not ask for a tool call, so we're done. + break + + # Call the tool and update the context with the result. + tool_output = await context.call_tool() + context.append_output(tool_output) + + # TODO: uncomment this and enable tool output streaming + # yield context + + # Create inputs for the next turn. + # Render the next prompt token ids. + prompt_token_ids = context.render_for_completion() + engine_prompt = EngineTokensPrompt( + prompt_token_ids=prompt_token_ids) + request_prompt = prompt_token_ids + # Update the sampling params. + sampling_params.max_tokens = (self.max_model_len - + len(prompt_token_ids)) + # OPTIMIZATION + priority = orig_priority - 1 + def _load_prompt_embeds( self, prompt_embeds: Optional[Union[bytes, list[bytes]]], diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index e009529fbd..f340854386 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -16,6 +16,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) +from vllm.entrypoints.context import ConversationContext, SimpleContext from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -29,7 +30,6 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger -from vllm.outputs import RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -187,7 +187,7 @@ class OpenAIServingResponses(OpenAIServing): raw_request.state.request_metadata = request_metadata # Schedule the request and get the result generator. 
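Editor's note: the tool-calling loop added to serving_engine.py above is easier to follow in isolation. The sketch below is a simplified restatement for readability only; logging, LoRA handling and the priority bump are omitted, and `run_turns`, its argument list and passing `max_model_len` explicitly are illustrative choices, not part of this patch.

from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

async def run_turns(engine, context, sampling_params, request_id, max_model_len):
    """Condensed restatement of _generate_with_builtin_tools (readability only)."""
    prompt = EngineTokensPrompt(prompt_token_ids=context.render_for_completion())
    while True:
        # Stream one model turn into the context; the harmony parser tracks messages.
        async for res in engine.generate(prompt, sampling_params, request_id):
            context.append_output(res)
        if not context.need_builtin_tool_call():
            break  # no "browser." / "python" recipient -> final answer reached
        # Run the built-in tool and append its output messages to the conversation.
        context.append_output(await context.call_tool())
        # Re-render the conversation for the next turn and shrink the token budget.
        token_ids = context.render_for_completion()
        prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
        sampling_params.max_tokens = max_model_len - len(token_ids)
    return context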
- generators: list[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[ConversationContext, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): default_max_tokens = self.max_model_len - len( @@ -195,21 +195,19 @@ class OpenAIServingResponses(OpenAIServing): sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params) - self._log_inputs(request.request_id, - request_prompts[i], - params=sampling_params, - lora_request=lora_request) - trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) - generator = self.engine_client.generate( - engine_prompt, - sampling_params, - request.request_id, + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, lora_request=lora_request, - trace_headers=trace_headers, priority=request.priority, + trace_headers=trace_headers, ) generators.append(generator) except ValueError as e: @@ -277,7 +275,7 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, sampling_params: SamplingParams, - result_generator: AsyncIterator[RequestOutput], + result_generator: AsyncIterator[ConversationContext], model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, @@ -285,17 +283,20 @@ class OpenAIServingResponses(OpenAIServing): ) -> Union[ErrorResponse, ResponsesResponse]: if created_time is None: created_time = int(time.time()) - final_res: Optional[RequestOutput] = None + context: Optional[ConversationContext] = None try: - async for res in result_generator: - final_res = res + async for context in result_generator: + pass except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + assert context is not None + assert isinstance(context, SimpleContext) + final_res = context.last_output assert final_res is not None assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] From 31f5dc5b2a5da18bc17240c7a67e8770d00901d8 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 11:41:42 -0700 Subject: [PATCH 029/932] [gpt-oss] Enhance error msg on attention sink init (#22335) Signed-off-by: simon-mo Signed-off-by: Yongye Zhu Co-authored-by: simon-mo --- vllm/v1/attention/backends/flashinfer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index caf9ecc911..061bd5f1d2 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -638,11 +638,15 @@ class FlashInferImpl(AttentionImpl): self.sinks: Optional[torch.Tensor] = None if sinks is not None: - assert sinks.shape[0] == num_heads, ( - "Sinks must have the same number of heads " - "as the number of heads in the layer" - ) - assert sinks.dtype == torch.float32, "Sinks must be of type float32" + if sinks.shape[0] != num_heads: + raise ValueError( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Expected {num_heads}, but got " + f"{sinks.shape[0]}." 
+ ) + if sinks.dtype != torch.float32: + raise ValueError("Sinks must be of type float32, but got " + f"{sinks.dtype}.") self.sinks = sinks def forward( From 31f09c615f4f067dba765ce5fe7d00d880212a6d Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 12:37:27 -0700 Subject: [PATCH 030/932] [gpt-oss] flashinfer mxfp4 (#22339) Signed-off-by: simon-mo Signed-off-by: Yongye Zhu Co-authored-by: simon-mo --- vllm/envs.py | 12 + vllm/model_executor/layers/fused_moe/layer.py | 32 +- .../layers/quantization/__init__.py | 3 + .../layers/quantization/mxfp4.py | 387 ++++++++++++++++++ .../layers/quantization/utils/mxfp4_utils.py | 22 + 5 files changed, 453 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/mxfp4.py diff --git a/vllm/envs.py b/vllm/envs.py index f8a7197dd1..8a3eb8e509 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -154,6 +154,8 @@ if TYPE_CHECKING: VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False def get_default_cache_root(): @@ -932,6 +934,16 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), + # If set to 1, use the FlashInfer + # MXFP8 (activation) x MXFP4 (weight) MoE backend. + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), + + # If set to 1, use the FlashInfer + # BF16 (activation) x MXFP4 (weight) MoE backend. + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))), + # Control the cache sized used by the xgrammar compiler. The default # of 512 MB should be enough for roughly 1000 JSON schemas. # It can be changed with this variable if needed for some reason. 
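Editor's note: a minimal sketch of how the two FlashInfer MXFP4 flags introduced above would be enabled. The model name comes from the supported-models table earlier in this series; using the offline LLM entrypoint and setting the variable via os.environ are illustrative assumptions, not part of the patch.

import os

# Choose one activation path for the MXFP4-weight MoE kernels before creating the engine.
os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"] = "1"     # BF16 activations x MXFP4 weights
# os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"] = "1"  # MXFP8 activations x MXFP4 weights

from vllm import LLM

llm = LLM(model="openai/gpt-oss-20b")  # checkpoint whose MoE weights use the mxfp4 scheme added above
print(llm.generate("Hello")[0].outputs[0].text)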
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f155a1b11f..a4a6157fa4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -33,7 +33,8 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx +from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, + round_up) from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): @@ -719,6 +720,12 @@ class FusedMoE(torch.nn.Module): self.global_num_experts = num_experts + num_redundant_experts + # we padding globally so EP buffer allocation works + if quant_config and quant_config.get_name() == "mxfp4" and ( + envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) + # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: @@ -1064,6 +1071,18 @@ class FusedMoE(torch.nn.Module): shard_id: str, expert_id: int, return_success: bool = False) -> Optional[bool]: + + if self.quant_config and self.quant_config.get_name() == "mxfp4": + # (FIXME) for gpt-oss all experts are combined + if "bias" in weight_name: + dim1 = loaded_weight.shape[1] + param.data[:, :dim1].copy_(loaded_weight) + else: + dim1 = loaded_weight.shape[1] + dim2 = loaded_weight.shape[2] + param.data[:, :dim1, :dim2].copy_(loaded_weight) + return True if return_success else None + expert_id = self._map_global_expert_id_to_local_expert_id(expert_id) if expert_id == -1: # Failed to load this param since it's not local to this rank @@ -1476,13 +1495,20 @@ class FusedMoE(torch.nn.Module): def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): + og_hidden_states = hidden_states.shape[-1] + if self.hidden_size != og_hidden_states: + hidden_states = F.pad(hidden_states, + (0, self.hidden_size - og_hidden_states), + mode='constant', + value=0.0) # TODO: Once the OOM issue for the TPU backend is resolved, we will # switch to using the moe_forward custom op. 
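# (Editor's note) The padding above and the [..., :og_hidden_states] slice just below
# keep the layer's public width unchanged: activations are zero-padded from
# og_hidden_states up to self.hidden_size, which __init__ rounds up to a multiple of
# 256 for the flashinfer mxfp4 path (e.g. 2880 -> 3072), and the custom-op output is
# then sliced back to og_hidden_states.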
if current_platform.is_tpu(): return self.forward_impl(hidden_states, router_logits) else: - return torch.ops.vllm.moe_forward(hidden_states, router_logits, - self.layer_name) + return torch.ops.vllm.moe_forward( + hidden_states, router_logits, + self.layer_name)[..., :og_hidden_states] def forward_impl_chunked(self, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor): diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 95aea912a1..8d63027e18 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -37,6 +37,7 @@ QuantizationMethods = Literal[ "auto-round", "rtn", "inc", + "mxfp4", ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) @@ -110,6 +111,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config + from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config from .qqq import QQQConfig @@ -148,6 +150,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "auto-round": AutoRoundConfig, "rtn": RTNConfig, "inc": INCConfig, + "mxfp4": Mxfp4Config, } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py new file mode 100644 index 0000000000..b6d7bc5d5c --- /dev/null +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -0,0 +1,387 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Callable, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import envs +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( + _can_support_mxfp4) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import next_power_of_2, round_up + +if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + # from flashinfer.fused_moe import cutlass_fused_moe + from flashinfer import (mxfp8_quantize, shuffle_matrix_a, + shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe) + + +class Mxfp4Config(QuantizationConfig): + + def __init__(self, ignored_layers: Optional[list[str]] = None): + super().__init__() + self.ignored_layers = ignored_layers + + @classmethod + def from_config(cls, config): + return cls() + + @classmethod + def get_min_capability(cls) -> int: + return 100 + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "mxfp4" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> 
Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + if self.ignored_layers and is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping): + return UnquantizedLinearMethod() + raise NotImplementedError("Mxfp4 linear layer is not implemented") + elif isinstance(layer, FusedMoE): + return Mxfp4MoEMethod(layer.moe_config) + elif isinstance(layer, Attention): + raise NotImplementedError( + "Mxfp4 attention layer is not implemented") + return None + + +class Mxfp4MoEMethod(FusedMoEMethodBase): + + def __init__(self, moe: FusedMoEConfig): + super().__init__() + self.topk_indices_dtype = None + self.moe = moe + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + self.num_experts = num_experts + weight_dtype = torch.uint8 + scale_dtype = torch.uint8 + + # FIXME (zyongye): ship after torch and safetensors support mxfp4 + # is_torch_mxfp4_available = ( + # hasattr(torch, "float4_e2m1fn_x2") and + # hasattr(torch, "float8_e8m0fnu")) + # if is_torch_mxfp4_available: + # weight_dtype = torch.float4_e2m1fn_x2 + # scale_dtype = torch.float8_e8m0fnu + + mxfp4_block = 32 + + intermediate_size_per_partition_after_pad = \ + intermediate_size_per_partition + # pad the intermediate size to be a multiple of 2 * mxfp4_block + # for to hold non-uniform sharded tensor as well as swizzling + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 256) + hidden_size = round_up(hidden_size, 256) + + self.intermediate_size = intermediate_size_per_partition_after_pad + self.hidden_size = hidden_size + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // 2, + dtype=weight_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w13_weight_scale = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // mxfp4_block, + dtype=scale_dtype), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + w13_bias = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16), + requires_grad=False) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // 2, + dtype=weight_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w2_weight_scale = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // mxfp4_block, + dtype=scale_dtype), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + w2_bias = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=torch.bfloat16), + requires_grad=False) + 
layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + def process_weights_after_loading(self, layer): + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + layer.gemm1_alpha = Parameter(torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_beta = Parameter(torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_clamp_limit = Parameter(torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + sf_block_size = 32 # mxfp4 block size + + assert (layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2) + assert (layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] + == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] + == self.hidden_size // sf_block_size) + assert (layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size and + layer.w2_weight.shape[2] == self.intermediate_size // 2) + assert (layer.w2_weight_scale.dim() == 3 + and layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size) + assert (layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2) + assert (layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size) + + w13_weight_scale = layer.w13_weight_scale.data + w2_weight_scale = layer.w2_weight_scale.data + w13_weight = layer.w13_weight.data + w2_weight = layer.w2_weight.data + w13_bias = layer.w13_bias.data.to(torch.float32) + w2_bias = layer.w2_bias.data.to(torch.float32) + + # Swap w1 and w3 as the defenition of + # swiglu is different in the trtllm-gen + def swap_every_two_rows(x, axis=-1): + shape = x.shape + if axis < 0: + axis = len(shape) + axis + + # Create a new shape with pairs swapped along specified axis + new_shape = list(shape) + new_shape[axis] = shape[axis] // 2 + new_shape.insert(axis + 1, 2) + + # Reshape to expose pairs, swap them, and reshape back + x = x.reshape(*new_shape) + x = x.flip(axis + 1) + new_shape = list(shape) + return x.reshape(*new_shape) + + w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2) + w13_weight = swap_every_two_rows(w13_weight, -2) + w13_bias = swap_every_two_rows(w13_bias, -1) + + # Do not interleave as the checkpoint is already interleaved + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_mxfp4_shuffled = [] + gemm1_scales_mxfp4_shuffled = [] + gemm2_weights_mxfp4_shuffled = [] + gemm2_scales_mxfp4_shuffled = [] + gemm1_bias_shuffled = [] + gemm2_bias_shuffled = [] + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + for i in range(self.num_experts): + gemm1_weights_mxfp4_shuffled.append( + shuffle_matrix_a(w13_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_scales_mxfp4_shuffled.append( + shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_bias_shuffled.append( + shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m)) + + gemm2_weights_mxfp4_shuffled.append( + 
shuffle_matrix_a(w2_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_scales_mxfp4_shuffled.append( + shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_bias_shuffled.append( + shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m)) + + w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled) + w13_weight_scale = torch.stack( + gemm1_scales_mxfp4_shuffled).reshape( + self.num_experts, 2 * self.intermediate_size, + self.hidden_size // sf_block_size).view( + torch.float8_e4m3fn) + + w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled) + w2_weight_scale = torch.stack(gemm2_scales_mxfp4_shuffled).reshape( + self.num_experts, self.hidden_size, self.intermediate_size // + sf_block_size).view(torch.float8_e4m3fn) + + layer.w13_weight = Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = Parameter(w13_weight_scale, + requires_grad=False) + layer.w2_weight = Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = Parameter(w2_weight_scale, + requires_grad=False) + layer.w13_bias = Parameter( + torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1), + requires_grad=False) + layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( + self.num_experts, -1), + requires_grad=False) + return + + def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int): + # Number of tokens in the input tensor. + num_tokens = x.shape[0] + # Factor to account for the imbalance of the experts. + # factor equals to the + # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert + # - 1.0 means perfect expert distribution. + # - > 1.0 means some experts have more + # tokens than the perfect distribution. + # - < 1.0 does not make sense. + imbalance_factor = 1.3 + # Calculate the number of tokens per expert + # assuming perfect distribution. + num_tokens_per_expert = (num_tokens * top_k) // self.num_experts + # Apply the imbalance factor. + num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor) + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile + # as it's the range supported by the kernel. 
+ tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + + return tile_tokens_dim + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + if enable_eplb: + raise NotImplementedError("EPLB is not supported for mxfp4") + + assert _can_support_mxfp4( + use_grouped_topk, topk_group, num_expert_group, expert_map, + custom_routing_function, e_score_correction_bias, + apply_router_weight_on_input, scoring_func, activation, + expert_load_view, logical_to_physical_map, + logical_replica_count), ("MXFP4 are not supported\ + with this configuration.") + + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + assert not self.moe.use_ep, ( + "EP is not supported for flashinfer mxfp4 moe backend yet.") + if envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: + assert x.dtype == torch.bfloat16 + x_quant = x + x_scale = None + else: + x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8 + x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1) + trtllm_gen_output = trtllm_fp4_block_scale_moe( + router_logits.to(torch.bfloat16), + None, # routing_bias + x_quant, + x_scale, + layer.w13_weight, # uint8 (e2m1 x 2) + layer.w13_weight_scale, # uint8 (e4m3 x 2) + layer.w13_bias, # fp32 per expert per channel + layer.gemm1_alpha, # fp32 per expert + layer.gemm1_beta, # fp32 per expert + layer.gemm1_clamp_limit, # fp32 per expert + layer.w2_weight, # uint8 (e2m1 x 2) + layer.w2_weight_scale, # ue8m0 + layer.w2_bias, # fp32 per expert per channel + None, # output1_scale_scalar + None, # output1_scale_gate_scalar + None, # output2_scale_scalar + self.num_experts, + top_k, + None, # n_group + None, # topk_group + self.intermediate_size, # padded to multiple of 256 + 0, # local_expert_offset + self.num_experts, # local num experts + None, + self._get_tile_tokens_dim(x, top_k), + 1 if renormalize else 0, # routing_method_type, renormalize + True, # do finalize + )[0] + return trtllm_gen_output diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 1119045db0..4a4e199e13 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Callable, Optional + import torch from vllm.utils import direct_register_custom_op @@ -7,6 +9,26 @@ from vllm.utils import direct_register_custom_op OCP_MX_BLOCK_SIZE = 32 +def _can_support_mxfp4(use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + e_score_correction_bias: Optional[torch.Tensor] = None, + 
apply_router_weight_on_input: bool = False, + scoring_func: str = "softmax", + activation: str = "silu", + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None): + return not (use_grouped_topk or topk_group or num_expert_group + or expert_map or custom_routing_function + or e_score_correction_bias or apply_router_weight_on_input + or scoring_func != "softmax" or activation != "silu" + or expert_load_view or logical_to_physical_map + or logical_replica_count) + + def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype) -> torch.Tensor: try: From 46a13949d5f64e4a40bac3cb30eb0f867074f741 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 7 Aug 2025 03:03:42 +0300 Subject: [PATCH 031/932] [v1] - Mamba1 Attention Metadata (#21249) Signed-off-by: asafg Co-authored-by: asafg --- csrc/mamba/mamba_ssm/selective_scan.h | 3 + csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 18 ++- docs/models/supported_models.md | 4 +- docs/usage/v1_guide.md | 12 +- .../models/language/generation/test_hybrid.py | 2 + tests/v1/test_oracle.py | 1 - .../layers/mamba/mamba_mixer.py | 144 +++++++++++++----- .../layers/mamba/mamba_mixer2.py | 7 +- .../layers/mamba/mamba_utils.py | 99 +++++++----- vllm/model_executor/models/bamba.py | 5 +- vllm/model_executor/models/falcon_h1.py | 5 +- .../model_executor/models/granitemoehybrid.py | 5 +- vllm/model_executor/models/jamba.py | 60 +++++--- vllm/model_executor/models/mamba.py | 77 ++++++---- vllm/model_executor/models/mamba2.py | 5 +- vllm/model_executor/models/nemotron_h.py | 5 +- vllm/model_executor/models/zamba2.py | 5 +- vllm/v1/attention/backends/mamba1_attn.py | 67 ++++++++ vllm/v1/attention/backends/mamba_selectors.py | 4 + 19 files changed, 367 insertions(+), 161 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba1_attn.py diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h index 563d2fe4ef..13c6178941 100644 --- a/csrc/mamba/mamba_ssm/selective_scan.h +++ b/csrc/mamba/mamba_ssm/selective_scan.h @@ -45,6 +45,9 @@ struct SSMParamsBase { index_t out_d_stride; index_t out_z_batch_stride; index_t out_z_d_stride; + index_t ssm_states_batch_stride; + index_t ssm_states_dim_stride; + index_t ssm_states_dstate_stride; // Common data pointers. 
void *__restrict__ A_ptr; diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 5766fbab4e..c4ddbc1427 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -132,8 +132,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate; - + input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + + cache_index * params.ssm_states_batch_stride + + dim_id * kNRows * params.ssm_states_dim_stride; + float D_val[kNRows] = {0}; if (params.D_ptr != nullptr) { #pragma unroll @@ -248,7 +250,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } // Initialize running total - scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0); + scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0); SSMScanPrefixCallbackOp prefix_op(running_prefix); typename Ktraits::BlockScanT(smem_scan).InclusiveScan( @@ -259,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if (threadIdx.x == 0) { smem_running_prefix[state_idx] = prefix_op.running_prefix; if (chunk == n_chunks - 1) { - ssm_states[state_idx] = input_t(prefix_op.running_prefix.y); + ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y); } } #pragma unroll @@ -481,6 +483,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.out_batch_stride = out.stride(1); params.out_d_stride = out.stride(0); + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); + } else{ if (!is_variable_B) { @@ -509,6 +515,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, } params.out_batch_stride = out.stride(0); params.out_d_stride = out.stride(1); + + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); } } diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 120fd3f485..3816412268 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -370,9 +370,9 @@ th { | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | -| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ | | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ | | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 38399c6633..d30144e8a8 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the | **Decoder-only Models** | 🚀 Optimized | | **Encoder-Decoder Models** | 🟠 Delayed | | **Embedding Models** | 🟢 Functional | -| **Mamba Models** | 🟢 (Mamba-2), 🟡 (Mamba-1) | +| **Mamba Models** | 🟢 (Mamba-2), 🟢 (Mamba-1) | | **Multimodal Models** | 🟢 Functional | vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. @@ -104,13 +104,11 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models -Models using selective state-space mechanisms instead of standard transformer attention are partially supported. -Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers -(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require -disabling prefix caching in V1. +Models using selective state-space mechanisms instead of standard transformer attention are supported. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. -Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that +Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. 
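For illustration, here is a minimal sketch of launching a Mamba-1 model under the constraints described in the updated guide above (eager mode, prefix caching disabled). It assumes the V1 engine is active (e.g. `VLLM_USE_V1=1`); the model name and sampling settings are placeholders, not recommendations.

```python
# Hedged example: serving a Mamba-1 model on the V1 engine under the
# constraints listed above. Assumes VLLM_USE_V1=1 in the environment.
from vllm import LLM, SamplingParams

llm = LLM(
    model="state-spaces/mamba-130m-hf",  # Mamba-1 checkpoint used in the tests above
    enforce_eager=True,                  # Mamba-1 currently requires eager mode in V1
    enable_prefix_caching=False,         # prefix caching must be disabled
)
out = llm.generate(["The capital of France is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```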
#### Encoder-Decoder Models diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 2238924c1b..67ba2f2559 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -53,6 +53,8 @@ HF_UNSUPPORTED_MODELS = [ ] V1_SUPPORTED_MODELS = [ + "state-spaces/mamba-130m-hf", + "ai21labs/Jamba-tiny-dev", "mistralai/Mamba-Codestral-7B-v0.1", "ibm-ai-platform/Bamba-9B-v1", "Zyphra/Zamba2-1.2B-instruct", diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index b68ed298a1..a756c89b52 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -12,7 +12,6 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine UNSUPPORTED_MODELS_V1 = [ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder - "state-spaces/mamba-130m-hf", # mamba1 ] MODEL = "meta-llama/Llama-3.2-1B-Instruct" diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 60cf3e1188..17b7f84a93 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,30 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import torch from torch import nn from torch.nn.parameter import Parameter -from vllm.attention.backends.abstract import AttentionMetadata +from vllm import envs +from vllm.config import get_current_vllm_config from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.forward_context import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer @CustomOp.register("mamba_mixer") -class MambaMixer(CustomOp): +class MambaMixer(MambaBase, CustomOp): """ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. 
A, D are input independent @@ -47,13 +54,16 @@ class MambaMixer(CustomOp): rms_norm_has_weight: bool = True, rms_norm_eps: float = 1e-5, activation="silu", - is_lora_enabled: bool = False): + is_lora_enabled: bool = False, + prefix: str = ""): super().__init__() self.time_step_rank = time_step_rank self.ssm_state_size = ssm_state_size self.use_rms_norm = use_rms_norm self.activation = activation self.is_lora_enabled = is_lora_enabled + self.conv_kernel_size = conv_kernel_size + self.intermediate_size = intermediate_size self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, @@ -131,14 +141,62 @@ class MambaMixer(CustomOp): has_weight=rms_norm_has_weight, ) if use_rms_norm else None - def forward_native(self, hidden_states: torch.Tensor, - conv_state: torch.Tensor, ssm_state: torch.Tensor): + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The outer list is for v0 PP virtual engine. Though this code path + # only runs for v1, we have to do this to unify with the interface + # of Attention + v0 PP. + # The inner tuple is (conv_state, ssm_state) + self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + + self.prefix = prefix + + def forward(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): + if not envs.VLLM_USE_V1: + return CustomOp.forward(self, hidden_states, mamba_cache_params) + else: + return self.forward_cuda(hidden_states, mamba_cache_params) + + def forward_native(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): pass - def forward_cuda(self, hidden_states: torch.Tensor, - mamba_cache_params: MambaCacheParams): + def forward_cuda(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + + if envs.VLLM_USE_V1: + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + mamba1_metadata = attn_metadata + assert isinstance(mamba1_metadata, Mamba1AttentionMetadata) + query_start_loc = mamba1_metadata.query_start_loc + state_indices_tensor = mamba1_metadata.state_indices_tensor + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + has_initial_state = mamba1_metadata.has_initial_states + context_lens_tensor = mamba1_metadata.context_lens_tensor + else: + assert mamba_cache_params is not None + conv_state = mamba_cache_params.conv_state + ssm_state = mamba_cache_params.ssm_state + state_indices_tensor = mamba_cache_params.state_indices_tensor + query_start_loc = attn_metadata.query_start_loc + context_lens_tensor = attn_metadata.context_lens_tensor + + if context_lens_tensor is not None: + has_initial_state = context_lens_tensor > 0 # 1. 
Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -148,8 +206,12 @@ class MambaMixer(CustomOp): conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: + if envs.VLLM_USE_V1 and attn_metadata is None: + # V1 profile run + hidden_states = hidden_states.contiguous() + return self.out_proj(hidden_states.transpose(-2, -1))[0] + + if query_start_loc is not None and context_lens_tensor is not None: # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| @@ -161,18 +223,18 @@ class MambaMixer(CustomOp): conv_weights, bias=self.conv1d.bias, activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=attn_metadata.context_lens_tensor > 0, - cache_indices=mamba_cache_params.state_indices_tensor, - query_start_loc=attn_metadata.query_start_loc) + conv_states=conv_state, + has_initial_state=has_initial_state, + cache_indices=state_indices_tensor, + query_start_loc=query_start_loc) else: hidden_states = causal_conv1d_update( hidden_states.transpose(0, 1), - mamba_cache_params.conv_state, + conv_state, conv_weights, self.conv1d.bias, self.activation, - conv_state_indices=mamba_cache_params.state_indices_tensor) + conv_state_indices=state_indices_tensor) hidden_states = hidden_states.transpose(0, 1) # 3. State Space Model sequence transformation @@ -203,11 +265,10 @@ class MambaMixer(CustomOp): time_proj_bias = (self.dt_proj.bias.float() if hasattr( self.dt_proj, "bias") else None) - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: + if query_start_loc is not None and context_lens_tensor is not None: scan_outputs = selective_scan_fn( hidden_states, - mamba_cache_params.ssm_state, + ssm_state, discrete_time_step, self.A, B.transpose(-2, -1), @@ -216,24 +277,23 @@ class MambaMixer(CustomOp): gate, time_proj_bias, delta_softplus=True, - cache_indices=mamba_cache_params.state_indices_tensor, - has_initial_state=attn_metadata.context_lens_tensor > 0, - query_start_loc=attn_metadata.query_start_loc) + cache_indices=state_indices_tensor, + has_initial_state=has_initial_state, + query_start_loc=query_start_loc) else: scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) - selective_state_update( - mamba_cache_params.ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), - self.A, - B, - C, - self.D, - gate.transpose(0, 1), - time_proj_bias, - dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor, - out=scan_outputs) + selective_state_update(ssm_state, + hidden_states.transpose(0, 1), + discrete_time_step.transpose(0, 1), + self.A, + B, + C, + self.D, + gate.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + state_batch_indices=state_indices_tensor, + out=scan_outputs) scan_outputs = scan_outputs.transpose(0, 1) # 4. 
Final linear projection @@ -245,3 +305,15 @@ class MambaMixer(CustomOp): contextualized_states = self.out_proj( scan_outputs.transpose(-2, -1))[0] return contextualized_states + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=get_tensor_model_parallel_world_size(), + intermediate_size=self.intermediate_size, + state_size=self.ssm_state_size, + conv_kernel=self.conv_kernel_size, + ) + + @property + def mamba_type(self) -> str: + return "mamba1" diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 5ac9a7f9ab..d5f4877135 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata, update_metadata) from vllm.model_executor.layers.mamba.mamba_utils import ( - extra_groups_for_head_shards, get_mamba_state_shape) + MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated @@ -278,8 +278,9 @@ class MambaMixer2(MambaBase, CustomOp): # - for TP we shard conv_dim by sharding on n_groups, # - but if n_groups cannot divide tp_size, we need to # extend some extra groups - self.n_groups = n_groups + extra_groups_for_head_shards( + groups = MambaStateShapeCalculator.extra_groups_for_head_shards( n_groups, self.tp_size) + self.n_groups = n_groups + groups self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size self.conv1d = ColumnParallelLinear( @@ -732,7 +733,7 @@ class MambaMixer2(MambaBase, CustomOp): output[:num_actual_tokens], _ = self.out_proj(hidden_states) def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=self.intermediate_size, tp_world_size=get_tensor_model_parallel_world_size(), n_groups=self.n_groups, diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 99a582066c..42c815b08f 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -3,53 +3,70 @@ from vllm.distributed import divide -def extra_groups_for_head_shards(ngroups: int, tp_size: int): - """Compute the increase in group numbers to account for - replication in order to accompany the head shards.""" +class MambaStateShapeCalculator: - # in the case ngoups % tp_size == 0, this will be zero - if ngroups % tp_size == 0: - return 0 + @classmethod + def mamba1_state_shape( + cls, + tp_world_size: int, + intermediate_size: int, + state_size: int, + conv_kernel: int, + use_v1: bool = True, + ) -> tuple[tuple[int, int], tuple[int, int]]: + conv_state_shape = (divide(intermediate_size, + tp_world_size), conv_kernel - 1) - # for n_groups == 1, this is exactly tp_size - n_groups - return tp_size - ngroups + temporal_state_shape = (divide(intermediate_size, + tp_world_size), state_size) + # In V0, the conv_state shape was swapped during allocation in + # MambaCacheManager, but in V1 it needs to be determined here at the + # calculation level + if use_v1: + conv_state_shape = conv_state_shape[1], conv_state_shape[0] -def get_mamba_state_shape( - intermediate_size: int, - 
tp_world_size: int, - n_groups: int, - num_heads: int, - head_dim: int, - state_size: int, - conv_kernel: int, - use_v1: bool = True, -) -> tuple[tuple[int, int], tuple[int, int, int]]: - """ Get the shape of mamba state.""" + return conv_state_shape, temporal_state_shape - # if n_groups is not divisible by world_size, need to extend the shards - # to ensure all groups needed by a head is sharded along with it - n_groups = (n_groups + - extra_groups_for_head_shards(n_groups, tp_world_size)) + @classmethod + def mamba2_state_shape( + cls, + tp_world_size: int, + intermediate_size: int, + n_groups: int, + num_heads: int, + head_dim: int, + state_size: int, + conv_kernel: int, + use_v1: bool = True, + ) -> tuple[tuple[int, int], tuple[int, int, int]]: + # if n_groups is not divisible by world_size, need to extend the shards + # to ensure all groups needed by a head is sharded along with it + n_groups = n_groups + cls.extra_groups_for_head_shards( + n_groups, tp_world_size) + # heads and n_groups are TP-ed + conv_dim = intermediate_size + 2 * n_groups * state_size - # - heads and n_groups are TP-ed - conv_dim = (intermediate_size + 2 * n_groups * state_size) - # contiguous along 'dim' axis - conv_state_shape = ( - conv_kernel - 1, - divide(conv_dim, tp_world_size), - ) + # contiguous along 'dim' axis + conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size)) + if not use_v1: + conv_state_shape = conv_state_shape[1], conv_state_shape[0] - if not use_v1: - conv_state_shape = (conv_state_shape[1], conv_state_shape[0]) + # These are not TP-ed as they depend on A, dt_bias, D + # - they are typically small + # e.g., (h_heads, head_dim, state_size) = (128, 64, 128) + temporal_state_shape = (divide(num_heads, + tp_world_size), head_dim, state_size) + return conv_state_shape, temporal_state_shape - # These are not TP-ed as they depend on A, dt_bias, D - # - they are typically small - # e.g., (h_heads, head_dim, state_size) = (128, 64, 128) - temporal_state_shape = ( - divide(num_heads, tp_world_size), - head_dim, - state_size, - ) + @classmethod + def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int): + """Compute the increase in group numbers to account for + replication in order to accompany the head shards.""" - return conv_state_shape, temporal_state_shape + # in the case ngoups % tp_size == 0, this will be zero + if ngroups % tp_size == 0: + return 0 + + # for n_groups == 1, this is exactly tp_size - n_groups + return tp_size - ngroups diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 0f54944276..4a2ae07581 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -25,7 +25,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -457,7 +458,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - 
return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 6a58b1501f..85d64af5bd 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -24,7 +24,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -543,7 +544,7 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, if hf_config.mamba_d_ssm is None else hf_config.mamba_d_ssm) - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 59c1dce48e..e59502f12a 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -23,7 +23,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -547,7 +548,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index ab21b7ce2c..c1033aff07 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -8,6 +8,7 @@ import torch from torch import nn from transformers import JambaConfig +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -19,6 +20,8 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from 
vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -32,8 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType -from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsV0Only) +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -112,7 +114,8 @@ class JambaMambaDecoderLayer(nn.Module): use_rms_norm=True, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, - is_lora_enabled = self.is_lora_enabled + is_lora_enabled = self.is_lora_enabled, + prefix=f"{prefix}.mixer", ) num_experts = config.layers_num_experts[layer_idx] @@ -344,7 +347,8 @@ class JambaModel(nn.Module): layer_mamba_cache_params = None if isinstance(layer, JambaAttentionDecoderLayer): kv_cache_index += 1 - if isinstance(layer, JambaMambaDecoderLayer): + if isinstance(layer, + JambaMambaDecoderLayer) and mamba_cache_params: current_state_layer = mamba_cache_index layer_mamba_cache_params = mamba_cache_params.at_layer_idx( current_state_layer) @@ -442,7 +446,7 @@ class JambaModel(nn.Module): class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - IsHybrid, SupportsV0Only): + IsHybrid): hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={ ".self_attn.": ".", ".A_log": ".A" @@ -509,14 +513,19 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + # NOTE: mamba_cache_params is not needed for v1 + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) + state_shape = self.get_mamba_state_shape_from_config( + self.vllm_config) + self.mamba_cache = MambaCacheManager(self.vllm_config, + self.lm_head.weight.dtype, + num_layers, *state_shape) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) @@ -529,19 +538,22 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: - world_size = get_tensor_model_parallel_world_size() - hidden_size = self.config.hidden_size - conv_state_shape = ( - self.config.mamba_expand * hidden_size // world_size, - self.config.mamba_d_conv - 1, + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + 
hidden_size = hf_config.hidden_size + + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=parallel_config.tensor_parallel_size, + intermediate_size=hf_config.mamba_expand * hidden_size, + state_size=hf_config.mamba_d_state, + conv_kernel=hf_config.mamba_d_conv, + use_v1=envs.VLLM_USE_V1, ) - temporal_state_shape = ( - self.config.mamba_expand * hidden_size // world_size, - self.config.mamba_d_state, - ) - return conv_state_shape, temporal_state_shape def compute_logits( self, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 8162ac3f75..80b63e1537 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -8,20 +8,21 @@ import torch from torch import nn from transformers import MambaConfig +from vllm import envs from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, - IsAttentionFree, SupportsPP, - SupportsV0Only) + IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -41,7 +42,8 @@ class MambaDecoderLayer(nn.Module): config: MambaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - is_lora_enabled: Optional[bool] = False) -> None: + is_lora_enabled: Optional[bool] = False, + prefix: str = "") -> None: super().__init__() self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" @@ -58,7 +60,8 @@ class MambaDecoderLayer(nn.Module): rms_norm_has_weight=not self.is_falcon_mamba, rms_norm_eps=mixer_rms_eps, activation=config.hidden_act, - is_lora_enabled=self.is_lora_enabled) + is_lora_enabled=self.is_lora_enabled, + prefix=f"{prefix}.mixer") self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -107,7 +110,8 @@ class MambaModel(nn.Module): lambda prefix: MambaDecoderLayer(config, cache_config=cache_config, quant_config=quant_config, - is_lora_enabled=is_lora_enabled), + is_lora_enabled=is_lora_enabled, + prefix=prefix), prefix=f"{prefix}.layers") self.norm_f = RMSNorm(config.hidden_size, @@ -123,7 +127,7 @@ class MambaModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, + mamba_cache_params: Optional[MambaCacheParams] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -140,12 +144,17 @@ class MambaModel(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + + layer_cache_params = None + if mamba_cache_params is not None: + layer_cache_params = mamba_cache_params.at_layer_idx( + i - self.start_layer) + hidden_states, residual = layer( 
positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=mamba_cache_params.at_layer_idx( - i - self.start_layer)) + mamba_cache_params=layer_cache_params) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, @@ -176,8 +185,7 @@ class MambaModel(nn.Module): return loaded_params -class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, - SupportsV0Only): +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -227,20 +235,40 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) + state_shape = self.get_mamba_state_shape_from_config( + self.vllm_config) + self.mamba_cache = MambaCacheManager(self.vllm_config, + self.lm_head.weight.dtype, + num_layers, *state_shape) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.backbone(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) return hidden_states + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=parallel_config.tensor_parallel_size, + intermediate_size=hf_config.intermediate_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_v1=envs.VLLM_USE_V1) + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): return self.mamba_cache.copy_inputs_before_cuda_graphs( input_buffers, **kwargs) @@ -248,19 +276,6 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: - world_size = get_tensor_model_parallel_world_size() - conv_state_shape = ( - self.config.intermediate_size // world_size, - self.config.conv_kernel - 1, - ) - temporal_state_shape = ( - self.config.intermediate_size // world_size, - self.config.state_size, - ) - return conv_state_shape, temporal_state_shape - def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: logits = self.logits_processor(self.lm_head, hidden_states, diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index adad181617..75e92b0176 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -19,7 +19,8 @@ from vllm.model_executor.layers.logits_processor import 
LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -220,7 +221,7 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.n_groups, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 6a999e2254..eb62d5a53c 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -39,7 +39,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -482,7 +483,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.n_groups, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 7764fd9b9e..4cb0becf30 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -32,7 +32,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -869,7 +870,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_ngroups, diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py new file mode 
100644 index 0000000000..f0e4636fdb --- /dev/null +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import ClassVar + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + + +class Mamba1AttentionBackend(AttentionBackend): + + @staticmethod + def get_builder_cls() -> type["Mamba1AttentionMetadataBuilder"]: + return Mamba1AttentionMetadataBuilder + + +@dataclass +class Mamba1AttentionMetadata: + query_start_loc: torch.Tensor + context_lens_tensor: torch.Tensor + state_indices_tensor: torch.Tensor + has_initial_states: torch.Tensor + + +class Mamba1AttentionMetadataBuilder( + AttentionMetadataBuilder[Mamba1AttentionMetadata]): + + reorder_batch_threshold: ClassVar[int] = 1 + + def __init__( + self, + kv_cache_spec: AttentionSpec, + vllm_config: VllmConfig, + device: torch.device, + layer_names: list[str], + ): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> Mamba1AttentionMetadata: + query_start_loc = common_attn_metadata.query_start_loc + + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to( + query_start_loc.device) + has_initial_states = (context_lens_tensor > 0) + + return Mamba1AttentionMetadata( + query_start_loc=query_start_loc, + context_lens_tensor=context_lens_tensor, + has_initial_states=has_initial_states, + state_indices_tensor=state_indices_tensor, + ) diff --git a/vllm/v1/attention/backends/mamba_selectors.py b/vllm/v1/attention/backends/mamba_selectors.py index 80021a2165..f56f2fb7bf 100644 --- a/vllm/v1/attention/backends/mamba_selectors.py +++ b/vllm/v1/attention/backends/mamba_selectors.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend def get_mamba_attn_backend(mamba_type: str) -> type[AttentionBackend]: + if mamba_type == "mamba1": + return Mamba1AttentionBackend + if mamba_type == "mamba2": return Mamba2AttentionBackend From eec890c1c1cdf6d4bbf4c0563fac54abe80ab8b6 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:03:53 -0400 Subject: [PATCH 032/932] [Bug] Fix B200 DeepGEMM E8M0 Accuracy Issue (#22399) Signed-off-by: yewentao256 --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 2aece9a1de..68a061968a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -799,7 +799,8 @@ def requant_weight_ue8m0_inplace( 
s_exp = s_exp[:m_cur, :k_cur] w_dq = w_q.to(torch.float32) * s_exp # Re-quantise using power-of-two scaling (UE8M0). - w_requant, s_requant = per_block_cast_to_fp8(w_dq, [block_m, block_k]) + w_requant, s_requant = per_block_cast_to_fp8(w_dq, [block_m, block_k], + use_ue8m0=True) # Write back the results in-place. w_q.copy_(w_requant) From 19c9365aa48d514ae6ef45242359dc98c6046666 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 6 Aug 2025 17:47:14 -0700 Subject: [PATCH 033/932] [gpt-oss] add demo tool server (#22393) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 7 ++ vllm/entrypoints/openai/cli_args.py | 4 ++ vllm/entrypoints/openai/serving_responses.py | 4 ++ vllm/entrypoints/tool_server.py | 70 ++++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 vllm/entrypoints/tool_server.py diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9bf4702320..88ef16b87e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -92,6 +92,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation) from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.tool_server import DemoToolServer, ToolServer from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) from vllm.logger import init_logger @@ -1620,6 +1621,11 @@ async def init_app_state( "This discrepancy may lead to performance degradation.", resolved_chat_template, args.model) + if args.tool_server == "demo": + tool_server: Optional[ToolServer] = DemoToolServer() + else: + tool_server = None + # Merge default_mm_loras into the static lora_modules default_mm_loras = (vllm_config.lora_config.default_mm_loras if vllm_config.lora_config is not None else {}) @@ -1654,6 +1660,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + tool_server=tool_server, reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index dfbc9cde3d..12318b300c 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -147,6 +147,10 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" """Special the tool parser plugin write to parse the model-generated tool into OpenAI API format, the name register in this plugin can be used in `--tool-call-parser`.""" + tool_server: Optional[str] = None + """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname). + Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. 
Or `demo` for demo + purpose.""" log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH """Path to logging config JSON file for both vllm and uvicorn""" max_log_len: Optional[int] = None diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f340854386..4ca863fd07 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -29,6 +29,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.tool_server import ToolServer from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams @@ -53,6 +54,7 @@ class OpenAIServingResponses(OpenAIServing): reasoning_parser: str = "", enable_auto_tools: bool = False, tool_parser: Optional[str] = None, + tool_server: Optional[ToolServer] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, ) -> None: @@ -114,6 +116,8 @@ class OpenAIServingResponses(OpenAIServing): self.background_tasks: dict[str, asyncio.Task] = {} + self.tool_server = tool_server + async def create_responses( self, request: ResponsesRequest, diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py new file mode 100644 index 0000000000..769c40e8cc --- /dev/null +++ b/vllm/entrypoints/tool_server.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import Any, Optional + +from openai_harmony import ToolNamespaceConfig + +from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class ToolServer(ABC): + + @abstractmethod + def has_tool(self, tool_name: str) -> bool: + """ + Return True if the tool is supported, False otherwise. + """ + pass + + @abstractmethod + def get_tool_description(self, + tool_name: str) -> Optional[ToolNamespaceConfig]: + """ + Return the tool description for the given tool name. + If the tool is not supported, return None. + """ + pass + + @abstractmethod + def new_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]: + """ + Create a session for the tool. + """ + ... 
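As a usage sketch (not part of the patch itself), this is one way a caller could consume the abstract `ToolServer` interface defined above. The default tool name and what is done with the yielded session are assumptions for illustration only; the real integration lives in the serving layer.

```python
# Hedged sketch of consuming the ToolServer interface defined above.
# "browser" and the handling of `session` are illustrative; concrete
# behavior depends on the ToolServer implementation in use.
from typing import Optional


async def maybe_use_tool(server: "ToolServer",
                         tool_name: str = "browser") -> Optional[str]:
    if not server.has_tool(tool_name):
        return None
    config = server.get_tool_description(tool_name)  # ToolNamespaceConfig or None
    async with server.new_session(tool_name) as session:
        # The yielded object is implementation-defined (a Harmony tool
        # instance for the demo server defined below); report its type only.
        return (f"{tool_name} (config={'set' if config else 'none'}): "
                f"{type(session).__name__}")
```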
+ + +class DemoToolServer(ToolServer): + + def __init__(self): + self.tools: dict[str, Tool] = {} + browser_tool = HarmonyBrowserTool() + if browser_tool.enabled: + self.tools["browser"] = browser_tool + python_tool = HarmonyPythonTool() + if python_tool.enabled: + self.tools["python"] = python_tool + logger.info("DemoToolServer initialized with tools: %s", + list(self.tools.keys())) + + def has_tool(self, tool_name: str) -> bool: + return tool_name in self.tools + + def get_tool_description(self, + tool_name: str) -> Optional[ToolNamespaceConfig]: + if tool_name not in self.tools: + return None + if tool_name == "browser": + return ToolNamespaceConfig.browser() + elif tool_name == "python": + return ToolNamespaceConfig.python() + else: + raise ValueError(f"Unknown tool {tool_name}") + + @asynccontextmanager + async def new_session(self, tool_name: str): + yield self.tools[tool_name] From 5c7cc33f4dafd4949a3f4bda815fa980d71ba45f Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 18:04:04 -0700 Subject: [PATCH 034/932] [gpt-oss] fix model config with hf_config (#22401) Signed-off-by: Yongye Zhu --- vllm/model_executor/models/gpt_oss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 896560fa24..c37c4e9610 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -61,9 +61,9 @@ class OAIAttention(nn.Module): "original_max_position_embeddings": config.rope_scaling["original_max_position_embeddings"], "beta_fast": - config.rope_ntk_beta, + config.rope_scaling["beta_fast"], "beta_slow": - config.rope_ntk_alpha, + config.rope_scaling["beta_slow"], }, is_neox_style=True, ) @@ -154,7 +154,7 @@ class MLPBlock(torch.nn.Module): dtype=torch.bfloat16) assert config.intermediate_size % self.world_size == 0 self.experts = FusedMoE(num_experts=config.num_local_experts, - top_k=config.num_experts_per_token, + top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, reduce_results=True, From 9a3835aaa9006c0d53628f278319642774d88fbe Mon Sep 17 00:00:00 2001 From: Lain Date: Wed, 6 Aug 2025 18:07:41 -0700 Subject: [PATCH 035/932] Fix trtllm-gen attention env and add attention sink (#22378) Signed-off-by: Siyuan Fu Signed-off-by: Lain Signed-off-by: Yongye Zhu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Yongye Zhu --- vllm/envs.py | 13 ++++--------- vllm/model_executor/models/gpt_oss.py | 5 ++--- vllm/utils/flashinfer.py | 8 ++++---- vllm/v1/attention/backends/flashinfer.py | 17 +++++++++-------- vllm/v1/attention/backends/utils.py | 6 ++---- 5 files changed, 21 insertions(+), 28 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8a3eb8e509..d9ebf59c1a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -152,8 +152,7 @@ if TYPE_CHECKING: VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False - VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False - VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False + VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False @@ -1043,13 +1042,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 
1, use the TRTLLM Context Attention backend in flashinfer. - "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_CONTEXT_ATTENTION", "0"))), - - # If set to 1, use the TRTLLM Decode Attention backend in flashinfer. - "VLLM_USE_TRTLLM_DECODE_ATTENTION": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", "0"))), + # If set to 1, use the TRTLLM attention backend in flashinfer. + "VLLM_USE_TRTLLM_ATTENTION": + lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index c37c4e9610..feb323a045 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -70,9 +70,8 @@ class OAIAttention(nn.Module): tp_size = get_tensor_model_parallel_world_size() - attention_sink_dtype = ( - torch.float32 if envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION else torch.bfloat16) + attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION + else torch.bfloat16) self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, dtype=attention_sink_dtype, diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index cce1aefaf9..32c52612ca 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -159,7 +159,7 @@ def use_trtllm_attention( # Check if the dimensions are supported by TRTLLM decode attention if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + or num_qo_heads % num_kv_heads != 0): return False env_value = envs.VLLM_USE_TRTLLM_ATTENTION @@ -169,10 +169,10 @@ def use_trtllm_attention( # Making the conditional check for zero because # the path is automatically enabled if the batch size condition # is satisfied. 
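# Illustrative sketch, not from this patch: the env var is now read as a raw
# string (see the envs.py hunk above), which yields three states instead of
# a boolean; `auto_detect` is a hypothetical stand-in for the size-based
# heuristic used below when the variable is unset.
def _resolve_trtllm_attention(env_value, auto_detect):
    if env_value is None:         # VLLM_USE_TRTLLM_ATTENTION unset
        return auto_detect()      # fall back to the heuristic
    return env_value == "1"       # "1" forces it on, anything else off


assert _resolve_trtllm_attention("1", lambda: False) is True
assert _resolve_trtllm_attention("0", lambda: True) is False
assert _resolve_trtllm_attention(None, lambda: True) is True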
- no_use_trtllm = (env_value == "0") - if not no_use_trtllm: + use_trtllm = (env_value == "1") + if use_trtllm: logger.info_once("Using TRTLLM attention.") - return not no_use_trtllm + return use_trtllm else: # Environment variable not set - use auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 061bd5f1d2..1fcb190286 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -215,6 +215,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers + # TODO: discard this for trtllm-gen backend self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) @@ -523,16 +524,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): head_dim = self.kv_cache_spec.head_size # currently prefill trtllm attention does not support fp8 kv cache - # trtllm may not support sliding window - prefill_use_trtllm = (self.global_hyperparameters.window_left == -1 - and not cache_dtype.startswith("fp8") - and use_trtllm_attention( + prefill_use_trtllm = use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim)) - decode_use_trtllm = (self.global_hyperparameters.window_left == -1 - and use_trtllm_attention( + num_qo_heads, num_kv_heads, head_dim) + decode_use_trtllm = use_trtllm_attention( num_decode_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim)) + num_qo_heads, num_kv_heads, head_dim) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -793,6 +790,8 @@ class FlashInferImpl(AttentionImpl): batch_size=attn_metadata.num_prefills, cum_seq_lens_q=attn_metadata.qo_indptr_gpu, cum_seq_lens_kv=attn_metadata.paged_kv_indptr_gpu, + window_left=window_left, + sinks=self.sinks, out=output[num_decode_tokens:], ) @@ -839,6 +838,8 @@ class FlashInferImpl(AttentionImpl): max_seq_len=attn_metadata.max_seq_len, bmm1_scale=layer._k_scale_float * self.scale, bmm2_scale=layer._v_scale_float, + window_left=window_left, + sinks=self.sinks, out=output[:num_decode_tokens], ) return output_padded diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index f521d94331..770c14572f 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -254,8 +254,7 @@ def get_kv_cache_layout(): # Override with format specified by the user. 
cache_layout = envs.VLLM_KV_CACHE_LAYOUT if cache_layout is None: - if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + if envs.VLLM_USE_TRTLLM_ATTENTION: cache_layout = "HND" else: cache_layout = get_kv_connector_cache_layout() @@ -333,8 +332,7 @@ def infer_global_hyperparameters( global_params = param_sets[0] # trtllm attention doesn't need global hyper params so disable the check - if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + if not envs.VLLM_USE_TRTLLM_ATTENTION: for params in param_sets: if params.window_left != global_params.window_left: raise ValueError( From e8961e963a76feb3e2c080220e79d2d5a9d272f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Aug 2025 21:10:24 -0400 Subject: [PATCH 036/932] Update `flashinfer-python==0.2.10` (#22389) Signed-off-by: mgoin --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d444087a3e..04a63f5d68 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -392,7 +392,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.9" +ARG FLASHINFER_GIT_REF="v0.2.10" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ diff --git a/setup.py b/setup.py index c6f4985c59..e374fcb816 100644 --- a/setup.py +++ b/setup.py @@ -665,7 +665,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.9"], + "flashinfer": ["flashinfer-python==0.2.10"], }, cmdclass=cmdclass, package_data=package_data, From 41b67f4263e6ee06cfb5e74073970e2cee854d5e Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:35:46 +0800 Subject: [PATCH 037/932] [model] Support MiniCPM-V 4.0 (#22166) Co-authored-by: imning3 --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/minicpmv.py | 148 +++++++++++++++++++++++-- 3 files changed, 140 insertions(+), 12 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 3816412268..265643a441 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -622,7 +622,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. 
| ✅︎ | | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index 69961d7385..2c2d094e04 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -427,7 +427,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", - extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 + extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 trust_remote_code=True, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index e172758b2f..3aa16bb9ab 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -38,6 +38,8 @@ from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -339,7 +341,9 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: mm_limits = {"image": None} - if self.get_model_version() == (2, 6): + if self.get_model_version() == (2, + 6) or self.get_model_version() == (4, + 0): mm_limits["video"] = None return mm_limits @@ -620,7 +624,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): out_keys: set[str], ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together - if self.info.get_model_version() == (2, 6): + if self.info.get_model_version() == ( + 2, 6) or self.info.get_model_version() == (4, 0): inputs = super()._call_hf_processor( prompt=prompts, # type: ignore mm_data=mm_data, @@ -679,10 +684,18 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - placeholder = { - "image": self.info.image_pattern, - "video": self.info.video_pattern, - } + placeholders = [("image", self.info.image_pattern), + ("video", self.info.video_pattern)] + + # hard code for inconsistency of encode-decode image_pattern + additional_placeholders = [] + tokenizer = self.info.get_tokenizer() + for modality, pattern in placeholders: + sub_pattern = tokenizer.decode( + tokenizer.encode(pattern, add_special_tokens=False)) + if sub_pattern != pattern: + additional_placeholders.append((modality, sub_pattern)) + placeholders += additional_placeholders def get_image_replacement(item_idx: int): images = mm_items.get_items( @@ -714,9 
+727,9 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): return [ PromptReplacement(modality=modality, - target=placeholder[modality], + target=pattern, replacement=get_replacement[modality]) - for modality in ("image", "video") + for modality, pattern in placeholders ] def _get_mm_fields_config( @@ -1262,11 +1275,124 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): return self.resampler(vision_embedding, tgt_sizes) + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + + +class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + assert self.version == (4, 0) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)): + return None + return quant_config + + def init_llm( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> nn.Module: + return LlamaForCausalLM(vllm_config=vllm_config, prefix=prefix) + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + with set_default_torch_dtype(torch.float16): + # The resampler in 4.0 remains consistent with the one in 2.5/2.6. 
+ resampler = Resampler2_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) + + return resampler.to(device=current_platform.device_type, + dtype=torch.get_default_dtype()) + + def get_vision_hidden_states( + self, data: MiniCPMVImagePixelInputs) -> torch.Tensor: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + + B = len(pixel_values) + P = pixel_values[0].shape[-2] + L = max(item.shape[-1] for item in pixel_values) + device = pixel_values[0].device + dtype = pixel_values[0].dtype + + all_pixel_values = torch.zeros((B, 3, P, L), + dtype=dtype, + device=device) + for i, pixel_values_item in enumerate(pixel_values): + L_item = pixel_values_item.shape[-1] + all_pixel_values[i, ..., :L_item] = pixel_values_item + + num_patches = tgt_sizes.prod(-1) + max_patches = num_patches.max().item() + assert isinstance(max_patches, int) + + patch_attn_mask = torch.zeros((B, max_patches), + dtype=torch.bool, + device=device) + for i, num_patches_item in enumerate(num_patches): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( + all_pixel_values, + patch_attention_mask=patch_attn_mask.unsqueeze(1), + tgt_sizes=tgt_sizes, + ) + + return self.resampler(vision_embedding, tgt_sizes) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, (2, 6): MiniCPMV2_6, + (4, 0): MiniCPMV4_0, } @@ -1294,8 +1420,10 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): # Dispatch class based on version instance_cls = _SUPPORT_VERSION.get(version) if instance_cls is None: - raise ValueError( - "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") + supported_versions = ", ".join( + [f"{v[0]}.{v[1]}" for v in sorted(_SUPPORT_VERSION.keys())]) + raise ValueError(f"Currently, MiniCPMV only supports versions " + f"{supported_versions}. Got version: {version}") # quant_config references base class members, # so update values before init is called From f825c6bd22133a8b2242457069f59654a2ae401b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 6 Aug 2025 22:37:14 -0300 Subject: [PATCH 038/932] Support encoder_only attention for FlexAttention (#22273) Signed-off-by: Max de Bayser --- tests/kernels/test_flex_attention.py | 88 +++++++++++++----- vllm/v1/attention/backends/flex_attention.py | 95 ++++++++++++++------ 2 files changed, 137 insertions(+), 46 deletions(-) diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index e25556c89f..f76bd19246 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -9,7 +9,9 @@ import pytest import torch from packaging import version -from vllm import LLM, SamplingParams +from vllm import SamplingParams + +from ..models.utils import check_embeddings_close TORCH_VERSION = version.parse(torch.__version__) MINIMUM_TORCH_VERSION = version.parse("2.7.0") @@ -28,7 +30,7 @@ def set_seed(seed): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_flex_attention_vs_default_backend(monkeypatch): +def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -36,7 +38,7 @@ def test_flex_attention_vs_default_backend(monkeypatch): """ model_name = "Qwen/Qwen2.5-1.5B-Instruct" seed = 42 - max_tokens = 32 + max_tokens = 24 prompts = [ "Hello, my name is", "The president of the United States is", @@ -54,33 +56,30 @@ def test_flex_attention_vs_default_backend(monkeypatch): m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") set_seed(seed) - - llm_flex = LLM( - model_name, - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) - output_flex = llm_flex.generate(prompts, sampling_params) + with vllm_runner(model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True) as llm_flex: + output_flex = llm_flex.generate(prompts, sampling_params) # Run with default backend with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") set_seed(seed) - llm_default = LLM( - model_name, - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) - output_default = llm_default.generate(prompts, sampling_params) + with vllm_runner(model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True) as llm_default: + output_default = llm_default.generate(prompts, sampling_params) # Compare outputs from both backends for i, (flex_result, default_result) in enumerate(zip(output_flex, output_default)): prompt = prompts[i] - flex_text = flex_result.outputs[0].text - default_text = default_result.outputs[0].text + flex_text = flex_result[1][0] + default_text = default_result[1][0] assert flex_text == default_text, ( f"FlexAttention output doesn't match default for: {prompt!r}\n" @@ -88,5 +87,54 @@ def test_flex_attention_vs_default_backend(monkeypatch): f"Default: {default_text!r}") +@pytest.mark.skipif( + not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, + reason="CUDA not available or PyTorch version < 2.7", +) +def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): + """Test that FlexAttention produces the same outputs as the default backend. + + This test compares the outputs from the FlexAttention backend with + the default backend for encoder models. 
+ """ + model_name = "BAAI/bge-base-en-v1.5" + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + ] + + # Run with flex attention + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + with vllm_runner(model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True) as llm_flex: + flex_outputs = llm_flex.embed(prompts) + + # Run with default backend + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + with vllm_runner(model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True) as llm_default: + default_outputs = llm_default.embed(prompts) + + check_embeddings_close( + embeddings_0_lst=flex_outputs, + embeddings_1_lst=default_outputs, + name_0="flex", + name_1="default", + tol=1e-2, + ) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index bb0d890c77..e599411b2d 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -148,6 +148,7 @@ def causal_mask_mod(b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, @dataclass class FlexAttentionMetadata: + causal: bool num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor @@ -177,10 +178,9 @@ class FlexAttentionMetadata: num_blocks = 0 block_mask: Optional[BlockMask] = None score_mod: Optional[_score_mod_signature] = None - mask_mod: Optional[_mask_mod_signature] = None logical_mask_mod: _mask_mod_signature = causal_mask_mod - def get_mask_mod(self) -> _mask_mod_signature: + def get_causal_mask_mod(self) -> _mask_mod_signature: """Creates the mask_mod function for FlexAttention. This function creates the combined mask mod function that handles: @@ -233,14 +233,39 @@ class FlexAttentionMetadata: return final_mask_mod + def get_bidirectional_mask_mod(self) -> _mask_mod_signature: + """Creates the encoder mask_mod function for FlexAttention. + + Since the encoder bidirectional attention doesn't run with + KV cache, this function creates a mask based on the + packed query sequences. + """ + # Create a lookup mapping from query indices -> request number + request_lookup = _offsets_to_doc_ids_tensor(self.query_start_loc) + + def final_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + ) -> torch.Tensor: + return request_lookup[q_idx] == request_lookup[kv_idx] + + return final_mask_mod + def build_block_mask(self) -> BlockMask: - assert self.mask_mod is not None + if self.causal: + mask_mod = self.get_causal_mask_mod() + kv_len = self.total_cache_tokens + else: + mask_mod = self.get_bidirectional_mask_mod() + kv_len = self.num_actual_tokens return create_block_mask_compiled( - self.mask_mod, + mask_mod, None, None, self.num_actual_tokens, - self.total_cache_tokens, + kv_len, device=self.block_table.device, ) @@ -251,7 +276,6 @@ class FlexAttentionMetadata: assert self.prefix_kv_lens is None, "Not implemented yet." assert self.suffix_kv_lens is None, "Not implemented yet." 
self.num_blocks = self.total_cache_tokens // self.block_size - self.mask_mod = self.get_mask_mod() self.block_mask = self.build_block_mask() @@ -306,6 +330,7 @@ class FlexAttentionMetadataBuilder( self.device, non_blocking=True) out = FlexAttentionMetadata( + causal=common_attn_metadata.causal, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, @@ -350,6 +375,12 @@ class FlexAttentionImpl(AttentionImpl): self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_kv_heads + self.attn_type = attn_type + + if attn_type not in (AttentionType.ENCODER_ONLY, + AttentionType.DECODER): + raise NotImplementedError( + f"FlexAttention does not support {attn_type} attention") if alibi_slopes is not None: raise NotImplementedError( @@ -425,26 +456,38 @@ class FlexAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens - key_cache, value_cache = kv_cache.unbind(0) + if not attn_metadata.causal: + assert self.attn_type == AttentionType.ENCODER_ONLY - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + query, key_tensor, value_tensor = map( + lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), + (query, key, value), + ) + + else: + assert self.attn_type == AttentionType.DECODER + key_cache, value_cache = kv_cache.unbind(0) + + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + # View out the block_size dim + key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size) + value_cache = value_cache.view(-1, self.num_kv_heads, + self.head_size) + query, key_tensor, value_tensor = map( + lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), + (query, key_cache, value_cache), + ) - # View out the block_size dim - key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size) - value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size) - query, key_cache, value_cache = map( - lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), - (query, key_cache, value_cache), - ) query = query[:, :, :num_actual_tokens, :] # Doesn't work for now -> constraint violation # torch._dynamo.try_mark_dynamic(query, 2) @@ -465,8 +508,8 @@ class FlexAttentionImpl(AttentionImpl): out = flex_attention_compiled( query, - key_cache, - value_cache, + key_tensor, + value_tensor, attn_metadata.score_mod, attn_metadata.block_mask, self.scale, From 1dc8a70b6d4e8ba4e139f1ddb86a166694f42f21 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 21:40:52 -0400 Subject: [PATCH 039/932] [Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588) Signed-off-by: Lucas Wilkinson --- tests/v1/spec_decode/test_eagle.py | 3 +- tests/v1/worker/test_gpu_model_runner.py | 6 +- vllm/attention/backends/abstract.py | 4 + vllm/attention/layer.py | 36 +- .../layers/chunked_local_attention.py | 88 +++++ vllm/attention/selector.py | 2 +- vllm/model_executor/models/llama4.py | 10 +- vllm/v1/attention/backends/utils.py | 48 ++- vllm/v1/spec_decode/eagle.py | 9 +- vllm/v1/worker/cpu_model_runner.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 342 +++++++++--------- vllm/v1/worker/tpu_model_runner.py | 5 +- vllm/v1/worker/utils.py | 21 ++ 13 files changed, 369 insertions(+), 213 deletions(-) create mode 100644 
vllm/attention/layers/chunked_local_attention.py diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 05f6dd40a9..73b47f8974 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -313,7 +313,8 @@ def test_propose(num_speculative_tokens, backend): # Mock runner for attention metadata building proposer.runner = mock.MagicMock() - proposer.runner.attn_metadata_builders = [attn_metadata_builder] + proposer.runner.attn_groups.append([mock.MagicMock()]) + proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder result = proposer.propose(target_token_ids=target_token_ids, target_positions=target_positions, diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 231dfcbb68..e151d388c2 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -417,12 +417,12 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): return rnd_stride # Patch the attention backend class and re-trigger the KV cache creation. - for attn_backend in model_runner.attn_backends: + for attn_group in model_runner._attn_group_iterator(): + attn_backend = attn_group.backend monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", rnd_stride_order) - model_runner.attn_backends = [] - model_runner.attn_metadata_builders = [] + model_runner.attn_groups = [] model_runner.initialize_kv_cache(model_runner.kv_cache_config) # Shape is unchanged, but layout may differ diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index ba20da4fd7..2417fe06a6 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -106,6 +106,10 @@ class AttentionBackend(ABC): block_size: int, num_seqs: int, num_queries: int) -> None: raise NotImplementedError + @classmethod + def full_cls_name(cls) -> tuple[str, str]: + return (cls.__module__, cls.__qualname__) + @dataclass class AttentionMetadata: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 178453ecdc..b4c3cbd7c9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -9,6 +9,7 @@ import torch.nn.functional as F import vllm.envs as envs from vllm.attention import AttentionType +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.config import CacheConfig, get_current_vllm_config @@ -80,6 +81,7 @@ class Attention(nn.Module): prefix: str = "", attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, + attn_backend: Optional[type[AttentionBackend]] = None, **extra_impl_args, ) -> None: """ @@ -137,15 +139,6 @@ class Attention(nn.Module): self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window - # For v1 we have backend agnostic iRoPE (local chunked attention) - # we have to store the flag on the layer so gpu model runner can - # set KVSpec appropriately (and pop it so it doesnt get passed to - # the backends) - if envs.VLLM_USE_V1: - self.use_irope = extra_impl_args.pop("use_irope", False) - else: - self.use_irope = extra_impl_args.get("use_irope", False) - quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None and not isinstance( @@ -166,18 +159,22 @@ class Attention(nn.Module): # During model initialization, the default dtype is 
set as the model # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(head_size, - dtype, - kv_cache_dtype, - block_size, - is_attention_free, - use_mla=use_mla) - impl_cls = attn_backend.get_impl_cls() + if attn_backend is None: + self.attn_backend = get_attn_backend(head_size, + dtype, + kv_cache_dtype, + block_size, + is_attention_free, + use_mla=use_mla) + else: + self.attn_backend = attn_backend + + impl_cls = self.attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, logits_soft_cap, attn_type, kv_sharing_target_layer_name, **extra_impl_args) - self.backend = backend_name_to_enum(attn_backend.get_name()) + self.backend = backend_name_to_enum(self.attn_backend.get_name()) self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how @@ -187,7 +184,7 @@ class Attention(nn.Module): self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() - self.use_output = attn_backend.accept_output_buffer + self.use_output = self.attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -309,6 +306,9 @@ class Attention(nn.Module): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + def get_attn_backend(self) -> type[AttentionBackend]: + return self.attn_backend + class MultiHeadAttention(nn.Module): """Multi-headed attention without any cache, used for ViT.""" diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py new file mode 100644 index 0000000000..892077ba91 --- /dev/null +++ b/vllm/attention/layers/chunked_local_attention.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from typing import List, Optional + +import torch + +from vllm import envs +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig, QuantizationConfig +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, make_local_attention_virtual_batches, + subclass_attention_backend, subclass_attention_metadata_builder) + +from ..layer import Attention + + +@functools.lru_cache +def create_chunked_local_attention_backend( + underlying_attn_backend: AttentionBackend, + attention_chunk_size: int, + block_size: int, +) -> type[AttentionBackend]: + prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" + + def build_preprocess_fn(cm: CommonAttentionMetadata): + return make_local_attention_virtual_batches(attention_chunk_size, cm, + block_size) + + # Dynamically create a new attention backend that wraps the + # underlying attention backend but applies + # `make_local_attention_virtual_batches` before calling `build(...)` + builder_cls = subclass_attention_metadata_builder( + name_prefix=prefix, + builder_cls=underlying_attn_backend.get_builder_cls(), + build_preprocess_fn=build_preprocess_fn) + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=builder_cls) + + return attn_backend + + +class ChunkedLocalAttention(Attention): + + def __init__(self, + num_heads: int, + head_size: int, + 
scale: float, + attention_chunk_size: int, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + kv_sharing_target_layer_name: Optional[str] = None, + prefix: str = ""): + dtype = torch.get_default_dtype() + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + if envs.VLLM_USE_V1: + underlying_attn_backend = get_attn_backend(head_size, dtype, + kv_cache_dtype, + block_size) + + attn_backend = create_chunked_local_attention_backend( + underlying_attn_backend, attention_chunk_size, block_size) + else: + # in v0 the local attention is handled inside the backends + attn_backend = None + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + kv_sharing_target_layer_name=kv_sharing_target_layer_name, + attn_backend=attn_backend) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 596c556e54..508470bb36 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -142,7 +142,7 @@ def get_attn_backend( dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, - is_attention_free: bool, + is_attention_free: bool = False, use_mla: bool = False, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 60098209c3..1f8b9d0744 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -25,6 +25,7 @@ from torch import nn from transformers import Llama4TextConfig from vllm.attention import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -194,17 +195,18 @@ class Llama4Attention(nn.Module): is_neox_style=is_neox_style, ) if not self.nope else None - self.attn = Attention( + attn_cls = Attention if self.nope else ChunkedLocalAttention + self.attn = attn_cls( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - per_layer_sliding_window=None, - use_irope=not self.nope, prefix=f"{prefix}.attn", - ) + **({ + "attention_chunk_size": config.attention_chunk_size + } if not self.nope else {})) def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor: floor = torch.floor((positions + 1.0) / self.floor_scale) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 770c14572f..e23dd8bc5b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -5,12 +5,12 @@ import enum import functools from abc import abstractmethod from dataclasses import dataclass, make_dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Generic, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Generic, Optional, + TypeVar) import numpy as np import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils import cdiv @@ -20,6 +20,8 @@ if 
TYPE_CHECKING: from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.layer import Attention from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) from vllm.logger import init_logger @@ -532,6 +534,48 @@ def make_local_attention_virtual_batches( ) +def subclass_attention_metadata_builder( + name_prefix: str, + builder_cls: type[AttentionMetadataBuilder[M]], + build_preprocess_fn: Callable[[CommonAttentionMetadata], + CommonAttentionMetadata], +) -> type[AttentionMetadataBuilder[M]]: + """ + Return a new subclass of `builder_cls` whose .build(...) method + first calls build_preprocess_fn(common_attn_metadata) on the metadata. + """ + name: str = name_prefix + builder_cls.__name__ # type: ignore + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False): + return builder_cls.build(self, common_prefix_len, + build_preprocess_fn(common_attn_metadata), + fast_build) + + Wrapped = type( + name, + (builder_cls, ), # inherit from the original + { + "build": build, + }) + return Wrapped # type: ignore + + +def subclass_attention_backend( + name_prefix: str, attention_backend_cls: type[AttentionBackend], + builder_cls: type[AttentionMetadataBuilder[M]] +) -> type[AttentionBackend]: + """ + Return a new subclass where `get_builder_cls` returns `builder_cls`. + """ + name: str = name_prefix + attention_backend_cls.__name__ # type: ignore + + return type(name, (attention_backend_cls, ), + {"get_builder_cls": lambda: builder_cls}) + + def split_decodes_and_prefills( common_attn_metadata: CommonAttentionMetadata, decode_threshold: int = 1, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index b2380bb3dd..3c36971fe5 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -158,9 +158,9 @@ class EagleProposer: assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups - attn_metadata = self.runner.attn_metadata_builders[ - 0].build_for_drafting(common_attn_metadata=common_attn_metadata, - draft_index=0) + attn_metadata = self.runner.attn_groups[0][0].metadata_builder\ + .build_for_drafting(common_attn_metadata=common_attn_metadata, + draft_index=0) # At this moment, we assume all eagle layers belong to the same KV # cache group, thus using the same attention metadata. 
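# Self-contained toy, using hypothetical classes rather than vLLM's, of the
# dynamic-subclassing pattern introduced by subclass_attention_metadata_builder
# and subclass_attention_backend above: a generated wrapper's build() first
# preprocesses the metadata and then delegates to the original builder, which
# is how ChunkedLocalAttention reuses an unmodified underlying backend.
class ToyBuilder:

    def build(self, common_prefix_len, common_attn_metadata):
        return ("built", common_prefix_len, common_attn_metadata)


def wrap_builder(builder_cls, preprocess):

    def build(self, common_prefix_len, common_attn_metadata):
        return builder_cls.build(self, common_prefix_len,
                                 preprocess(common_attn_metadata))

    return type("Preprocessed" + builder_cls.__name__, (builder_cls, ),
                {"build": build})


LocalToyBuilder = wrap_builder(ToyBuilder, lambda meta: ("chunked", meta))
assert LocalToyBuilder().build(0, "meta") == ("built", 0, ("chunked", "meta"))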
@@ -349,7 +349,8 @@ class EagleProposer: hidden_states: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, ) -> list[torch.Tensor]: - tree_attn_metadata_builder = self.runner.attn_metadata_builders[0] + tree_attn_metadata_builder = \ + self.runner.attn_groups[0][0].metadata_builder assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index d8f3e0d89a..11b96d9463 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -53,11 +53,11 @@ class CPUModelRunner(GPUModelRunner): raise ValueError("Multiple KVCacheGroups is not" "currently supported with CPU model runner.") - assert type( - self.attn_metadata_builders[0]) is TorchSDPAMetadataBuilderV1 + assert type(self.attn_groups[0] + [0].metadata_builder) is TorchSDPAMetadataBuilderV1 - self.attn_metadata_builders[0].reorder_batch(self.input_batch, - scheduler_output) + self.attn_groups[0][0].metadata_builder.reorder_batch( + self.input_batch, scheduler_output) def _postprocess_tenosrs(self) -> None: # Note: replace device tensors with cpu tensors diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 549f21af79..08b253dcdb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,7 +3,10 @@ import dataclasses import gc +import itertools import time +from collections import defaultdict +from collections.abc import Iterator from contextlib import contextmanager from typing import TYPE_CHECKING, Any, Optional, Union, cast @@ -14,9 +17,9 @@ import torch.nn as nn from tqdm import tqdm import vllm.envs as envs -from vllm.attention import AttentionType, get_attn_backend +from vllm.attention import Attention, AttentionType from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.layer import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.counter import compilation_counter from vllm.config import (CompilationLevel, VllmConfig, get_layers_from_vllm_config, update_config) @@ -50,7 +53,6 @@ from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, - make_local_attention_virtual_batches, reorder_batch_to_split_decodes_and_prefills) from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, @@ -73,8 +75,8 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from ..sample.logits_processor import LogitsProcessorManager -from .utils import (MultiModalBudget, bind_kv_cache, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, +from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: @@ -162,8 +164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] - self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] - self.attn_backends: list[type[AttentionBackend]] = [] + # indexes: [kv_cache_group_id][attn_group] + self.attn_groups: list[list[AttentionGroup]] = 
[] # self.kv_cache_config: KVCacheConfig # req_id -> (input_id -> encoder_output) @@ -830,81 +832,51 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): spec_decode_common_attn_metadata is None: spec_decode_common_attn_metadata = common_attn_metadata - if isinstance(kv_cache_group_spec.kv_cache_spec, - ChunkedLocalAttentionSpec): - common_attn_metadata = make_local_attention_virtual_batches( - kv_cache_group_spec.kv_cache_spec.attention_chunk_size, - common_attn_metadata, self.cache_config.block_size) + for attn_group in self.attn_groups[kv_cache_group_id]: + # Prepare for cascade attention if enabled & beneficial. + common_prefix_len = 0 + builder = attn_group.metadata_builder + if self.cascade_attn_enabled: + common_prefix_len = self._compute_cascade_attn_prefix_len( + num_scheduled_tokens, + scheduler_output. + num_common_prefix_blocks[kv_cache_group_id], + kv_cache_group_spec.kv_cache_spec, + builder, + ) - # Prepare for cascade attention if enabled & beneficial. - common_prefix_len = 0 - builder = self.attn_metadata_builders[kv_cache_group_id] - if self.cascade_attn_enabled: - common_prefix_len = self._compute_cascade_attn_prefix_len( - num_scheduled_tokens, - scheduler_output. - num_common_prefix_blocks[kv_cache_group_id], - kv_cache_group_spec.kv_cache_spec, - builder, - ) - - attn_metadata_i = (builder.build( - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata, - )) - - fast_prefill_metadata = attn_metadata_i - if (self.cache_config.kv_sharing_fast_prefill - and self.kv_sharing_fast_prefill_eligible_layers): - # Dynamically create a a dataclass type that inherits - # from attention metadata type but includes additional - # fields logits_indices_padded and num_logits_indices - # which are required for prefill truncation - fast_prefill_metadata_type = ( - make_kv_sharing_fast_prefill_attention_metadata( - metadata_cls=type(attn_metadata_i), )) - fast_prefill_metadata = fast_prefill_metadata_type( - **dataclasses.asdict(attn_metadata_i), - logits_indices_padded=logits_indices_padded, - num_logits_indices=logits_indices.size(0), - ) - - for layer_name in kv_cache_group_spec.layer_names: - if (self.cache_config.kv_sharing_fast_prefill and layer_name - in self.kv_sharing_fast_prefill_eligible_layers): - attn_metadata[layer_name] = fast_prefill_metadata - continue - - attn_metadata[layer_name] = attn_metadata_i - - # Hack for now to fix chunked local attention + no hybrid kv cache - # manager we can remove this once - # https://github.com/vllm-project/vllm/pull/21588 - # is merged (i.e. 
properly handle different attention backends for - # the same kv_cache_spec) - if self.attention_chunk_size is not None \ - and self.scheduler_config.disable_hybrid_kv_cache_manager: - if not hasattr(self, "local_attention_layers"): - self.local_attention_layers = [] - attn_layers = get_layers_from_vllm_config( - self.vllm_config, Attention) - for layer_name, attn_module in attn_layers.items(): - if attn_module.use_irope: - self.local_attention_layers.append(layer_name) - - local_attn_metadata_i = (builder.build( - common_prefix_len=0, - common_attn_metadata=make_local_attention_virtual_batches( - self.attention_chunk_size, common_attn_metadata, - self.cache_config.block_size), + attn_metadata_i = (builder.build( + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata, )) - for layer_name in self.local_attention_layers: - attn_metadata[layer_name] = local_attn_metadata_i + fast_prefill_metadata = attn_metadata_i + if (self.cache_config.kv_sharing_fast_prefill + and self.kv_sharing_fast_prefill_eligible_layers): + # Dynamically create a a dataclass type that inherits + # from attention metadata type but includes additional + # fields logits_indices_padded and num_logits_indices + # which are required for prefill truncation + fast_prefill_metadata_type = ( + make_kv_sharing_fast_prefill_attention_metadata( + metadata_cls=type(attn_metadata_i), )) + fast_prefill_metadata = fast_prefill_metadata_type( + **dataclasses.asdict(attn_metadata_i), + logits_indices_padded=logits_indices_padded, + num_logits_indices=logits_indices.size(0), + ) + + for layer_name in attn_group.layer_names: + if (self.cache_config.kv_sharing_fast_prefill + and layer_name + in self.kv_sharing_fast_prefill_eligible_layers): + attn_metadata[layer_name] = fast_prefill_metadata + continue + attn_metadata[layer_name] = attn_metadata_i attention_cuda_graphs = all( - b.can_run_in_cudagraph(common_attn_metadata) - for b in self.attn_metadata_builders) + g.metadata_builder.can_run_in_cudagraph(common_attn_metadata) + for g in self._attn_group_iterator()) # Hot-Swap lora model if self.lora_config: @@ -2229,11 +2201,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): block_table[kv_cache_group_id].slot_mapping[:num_tokens], causal=True) - attn_metadata_i = self.attn_metadata_builders[ - kv_cache_group_id].build_for_cudagraph_capture( - common_attn_metadata) - for layer_name in kv_cache_group_spec.layer_names: - attn_metadata[layer_name] = attn_metadata_i + for attn_group in self.attn_groups[kv_cache_group_id]: + attn_metadata_i = attn_group.metadata_builder\ + .build_for_cudagraph_capture(common_attn_metadata) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -2565,88 +2537,100 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def _initialize_single_attn_backend( - self, kv_cache_spec: KVCacheSpec, layer_names: list[str] - ) -> tuple[AttentionBackend, AttentionMetadataBuilder]: - if isinstance(kv_cache_spec, AttentionSpec): - attn_backend_i = get_attn_backend( - kv_cache_spec.head_size, - self.dtype, - kv_cache_spec.dtype, - kv_cache_spec.block_size, - self.model_config.is_attention_free, - use_mla=kv_cache_spec.use_mla, - ) - if attn_backend_i is None: - error_msg = (f"Error with get_attn_backend: " - 
f"{kv_cache_spec.head_size=}, " - f"{self.dtype=}, {kv_cache_spec.dtype=}, " - f"{kv_cache_spec.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{kv_cache_spec.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 " - "GPUModelRunner.") - elif isinstance(kv_cache_spec, MambaSpec): - attn_backend_i = get_mamba_attn_backend(kv_cache_spec.mamba_type) - else: - raise ValueError( - f"Unknown KV cache spec type: {type(kv_cache_spec)}") - - attn_metadata_builder_i = attn_backend_i.get_builder_cls()( - kv_cache_spec, - layer_names, - self.vllm_config, - self.device, - ) - - if self.full_cuda_graph: - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.NEVER: - raise ValueError(f"Full CUDAGraph not supported for " - f"{attn_backend_i.__name__}. Turn off " - f"CompilationConfig.full_cuda_graph or use a " - f" different attention backend.") - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.PURE_DECODE_ONLY: - # Limit the max cudagraph size to the max number of - # sequences for pure decode only cudagraph backend, - # whose max_query_len is 1. - self.cudagraph_batch_sizes = [ - size for size in self.cudagraph_batch_sizes - if size <= self.scheduler_config.max_num_seqs - ] - return attn_backend_i, attn_metadata_builder_i - def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize the attention backends and attention metadata builders. """ - assert len(self.attn_backends) == 0 and len( - self.attn_metadata_builders - ) == 0, "Attention backends are already initialized" - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec + assert len(self.attn_groups) == 0, \ + "Attention backends are already initialized" + attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - attn_backend_i, attn_metadata_builder_i = ( - self._initialize_single_attn_backend( - kv_cache_spec, kv_cache_group_spec.layer_names)) - self.attn_backends.append(attn_backend_i) - self.attn_metadata_builders.append(attn_metadata_builder_i) + def get_attn_backends_for_layers( + layer_names: list[str] + ) -> dict[type[AttentionBackend], list[str]]: + attn_backends = {} + attn_backend_layers = defaultdict(list) + # Dedupe based on full class name; this is a bit safer than using + # using the class itself as the key because when we create dynamic + # attention backend subclasses (e.g. ChunkedLocalAttention) unless + # they are cached correctly, there will be different objects per + # layer. 
+ for layer_name in layer_names: + attn_backend = attn_layers[layer_name].get_attn_backend() + key = attn_backend.full_cls_name() + attn_backends[key] = attn_backend + attn_backend_layers[key].append(layer_name) + return { + attn_backends[k]: v + for k, v in attn_backend_layers.items() + } + + def create_attn_groups( + attn_backends_map: dict[AttentionBackend, list[str]], + kv_cache_spec: KVCacheSpec, + ) -> list[AttentionGroup]: + attn_groups: list[AttentionGroup] = [] + for attn_backend, layer_names in attn_backends_map.items(): + attn_metadata_builder_i = attn_backend.get_builder_cls()( + kv_cache_spec, + layer_names, + self.vllm_config, + self.device, + ) + attn_group = AttentionGroup(attn_backend, + attn_metadata_builder_i, + layer_names) + attn_groups.append(attn_group) + + if self.full_cuda_graph: + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.NEVER: + raise ValueError( + f"Full CUDAGraph not supported for " + f"{attn_backend.__name__}. Turn off " + f"CompilationConfig.full_cuda_graph or use a " + f" different attention backend.") + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.PURE_DECODE_ONLY: + # Limit the max cudagraph size to the max number of + # sequences for pure decode only cudagraph backend, + # whose max_query_len is 1. + self.cudagraph_batch_sizes = [ + size for size in self.cudagraph_batch_sizes + if size <= self.scheduler_config.max_num_seqs + ] + + return attn_groups + + for kv_cache_group_spec in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(kv_cache_spec, AttentionSpec): + attn_backends = get_attn_backends_for_layers( + kv_cache_group_spec.layer_names) + # TODO(lucas): move `get_mamba_attn_backend` into the mamba + # layers like above + elif isinstance(kv_cache_spec, MambaSpec): + attn_backends = { + get_mamba_attn_backend(kv_cache_spec.mamba_type): + kv_cache_group_spec.layer_names + } + else: + raise ValueError( + f"Unknown KV cache spec type: {type(kv_cache_spec)}") + + self.attn_groups.append( + create_attn_groups(attn_backends, kv_cache_spec)) # Calculate reorder batch threshold (if neeeded) self.calculate_reorder_batch_threshold() - if len(self.attn_backends) > 0: + if len(self.attn_groups) > 0: return # Check if model is encoder-only block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla attn_specs = list[AttentionSpec]() - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) for attn_module in attn_layers.values(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: @@ -2666,11 +2650,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(attn_specs) == len(attn_layers), \ "All or none of the layers are expected to be encoder-only" - attn_backend, attn_metadata_builder = ( - self._initialize_single_attn_backend(attn_specs[0], - attn_layers.keys())) - self.attn_backends.append(attn_backend) - self.attn_metadata_builders.append(attn_metadata_builder) + attn_backends = get_attn_backends_for_layers(attn_layers.keys()) + + self.attn_groups.append( + create_attn_groups(attn_backends, attn_specs[0])) self.is_encoder_only_model = True def calculate_reorder_batch_threshold(self) -> None: @@ -2678,7 +2661,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Check that if any backends reorder batches; that the reordering is compatible (e.g., decode threshold is the same) """ - for attn_metadata_builder_i in 
self.attn_metadata_builders: + for group in self._attn_group_iterator(): + attn_metadata_builder_i = group.metadata_builder + # check that if any backends reorder batches; that the reordering # is compatible (e.g., decode threshold is the same) reorder_batch_threshold_i = ( @@ -2752,6 +2737,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )), "Some layers are not correctly initialized" return kv_cache_raw_tensors + def _attn_group_iterator(self) -> Iterator[AttentionGroup]: + return itertools.chain.from_iterable(self.attn_groups) + + def _kv_cache_spec_attn_group_iterator( + self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + if not self.kv_cache_config.kv_cache_groups: + return + for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): + for attn_group in attn_groups: + yield self.kv_cache_config.kv_cache_groups[ + kv_cache_spec_id].kv_cache_spec, attn_group + def _reshape_kv_cache_tensors( self, kv_cache_config: KVCacheConfig, @@ -2770,23 +2767,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - for layer_name in kv_cache_group_spec.layer_names: + for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + attn_backend = group.backend + for layer_name in group.layer_names: raw_tensor = kv_cache_raw_tensors[layer_name] assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0 num_blocks = (raw_tensor.numel() // kv_cache_spec.page_size_bytes) if isinstance(kv_cache_spec, AttentionSpec): has_attn = True - kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( + kv_cache_shape = attn_backend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype try: - kv_cache_stride_order = self.attn_backends[ - i].get_kv_cache_stride_order() + kv_cache_stride_order = \ + attn_backend.get_kv_cache_stride_order() assert len(kv_cache_stride_order) == len( kv_cache_shape) except (AttributeError, NotImplementedError): @@ -2850,15 +2846,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_raw_tensors: The KV cache buffer of each layer. 
""" - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - for layer_name in kv_cache_group_spec.layer_names: + for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for layer_name in group.layer_names: raw_tensor = kv_cache_raw_tensors[layer_name] num_blocks = (raw_tensor.numel() // kv_cache_spec.page_size_bytes) if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( + + kv_cache_shape = group.backend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) if kv_cache_shape[0] != num_blocks or kv_cache_shape[ @@ -2893,6 +2888,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.shared_kv_cache_layers, kv_cache_config.kv_cache_groups, kv_caches, + self.attn_groups, ) attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) @@ -2958,9 +2954,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): continue # TODO: Support other attention modules, e.g., cross-attention + # TODO(lucas): move the attention specs into the model layers like + # the attention backends if attn_module.attn_type == AttentionType.DECODER: - use_local_attention = (self.attention_chunk_size is not None - and attn_module.use_irope) if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -2969,10 +2965,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, use_mla=use_mla) - assert not use_local_attention, ( - "attention module can not be with ", - "both local attention and sliding window") - elif use_local_attention: + elif self.attention_chunk_size is not None \ + and isinstance(attn_module, ChunkedLocalAttention): kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, @@ -3043,7 +3037,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Use the first attention metadata builder # to create encoder attention metadata - builder = self.attn_metadata_builders[0] + builder = self.attn_groups[0][0].metadata_builder dummy_block_table = torch.zeros((num_reqs, 1), dtype=torch.int32, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5f3188efdb..81252f9b60 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -15,8 +15,9 @@ import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr import vllm.envs as envs +from vllm.attention import Attention from vllm.attention.backends.abstract import AttentionType -from vllm.attention.layer import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import (ParallelConfig, VllmConfig, get_layers_from_vllm_config, update_config) @@ -518,7 +519,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): continue if attn_module.attn_type == AttentionType.DECODER: - if attn_module.use_irope: + if isinstance(attn_module, ChunkedLocalAttention): logger.warning_once( "Using irope in Pallas is not supported yet, it " "will fall back to global attention for long context.") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 6761b3c5e4..e7079235d6 
100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,14 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict +from dataclasses import dataclass from typing import TYPE_CHECKING, Optional import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.registry import MultiModalRegistry +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec @@ -122,6 +125,13 @@ class MultiModalBudget: return max_items_per_prompt, max_items_per_batch +@dataclass +class AttentionGroup: + backend: type[AttentionBackend] + metadata_builder: AttentionMetadataBuilder + layer_names: list[str] + + def sanity_check_mm_encoder_outputs( mm_embeddings: MultiModalEmbeddings, expected_num_items: int, @@ -196,6 +206,8 @@ def initialize_kv_cache_for_kv_sharing( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], kv_caches: dict[str, torch.Tensor], + # Optional for now to avoid breaking TPU + attn_groups: Optional[list[list[AttentionGroup]]] = None, ) -> None: """ Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches` @@ -225,6 +237,15 @@ def initialize_kv_cache_for_kv_sharing( group_idx = layer_to_kv_cache_group_idx[target_layer_name] kv_cache_groups[group_idx].layer_names.append(layer_name) + if attn_groups is not None: + assert len(attn_groups[group_idx]) == 1, ( + "Only one attention group per KV cache group is supported " + "for KV-cache sharing for now.") + # TODO(lucas): I think in the future the layers that re-use a + # KV cache will be in a different attention group so we can + # remove this code from here. + attn_groups[group_idx][0].layer_names.append(layer_name) + def bind_kv_cache( kv_caches: dict[str, torch.Tensor], From 6b47ef24de3d3b4f551aca0bc21b9f16f3d21b6a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 7 Aug 2025 10:28:11 +0800 Subject: [PATCH 040/932] [XPU]Fix `flash_attn_varlen_func` interface on xpu (#22350) Signed-off-by: Kunshang Ji --- vllm/_ipex_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 7533bf5ef7..79e3e448ca 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -271,6 +271,7 @@ class ipex_ops: k_descale=None, v_descale=None, num_splits=0, + s_aux: Optional[torch.Tensor] = None, ): if cu_seqlens_k is None: # cu_seqlens_k is not used in ipex kernel. From 7377131a2ccb49cae71aa503ee5be520aa080904 Mon Sep 17 00:00:00 2001 From: Tao He Date: Thu, 7 Aug 2025 10:58:08 +0800 Subject: [PATCH 041/932] [Qwen3] Enable dual-chunk-attention support for Qwen3 models. (#21924) Signed-off-by: Tao He --- vllm/model_executor/models/qwen3.py | 64 +++++++++++++++---------- vllm/model_executor/models/qwen3_moe.py | 27 ++++++++--- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index d2ae8959b1..0ad50640bb 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -23,7 +23,7 @@ # limitations under the License. 
"""Inference-only Qwen3 model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -47,27 +47,31 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + maybe_prefix) logger = init_logger(__name__) class Qwen3Attention(nn.Module): - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[tuple] = None, - prefix: str = "", - attn_type: str = AttentionType.DECODER) -> None: + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + rope_scaling: Optional[tuple] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -89,6 +93,7 @@ class Qwen3Attention(nn.Module): self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta + self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( hidden_size, @@ -113,15 +118,22 @@ class Qwen3Attention(nn.Module): max_position=max_position, base=self.rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + attn_type=attn_type, + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=attn_type) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -161,6 +173,9 @@ class Qwen3DecoderLayer(nn.Module): # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) # By default, Qwen3 uses causal attention as it is a decoder-only model. 
# You can override the HF config with `is_causal=False` to enable @@ -185,6 +200,7 @@ class Qwen3DecoderLayer(nn.Module): rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, ) self.mlp = Qwen3MLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index ca14fd0657..7410589190 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -185,6 +185,7 @@ class Qwen3MoeAttention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -208,6 +209,7 @@ class Qwen3MoeAttention(nn.Module): self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings + self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear(hidden_size, self.head_dim, @@ -229,14 +231,21 @@ class Qwen3MoeAttention(nn.Module): max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -280,6 +289,9 @@ class Qwen3MoeDecoderLayer(nn.Module): rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) self.self_attn = Qwen3MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -293,6 +305,7 @@ class Qwen3MoeDecoderLayer(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + dual_chunk_attention_config=dual_chunk_attention_config, ) # `mlp_only_layers` in the config. 
From 04cf435d95fee3e4c0ba521583c1a64bc348c89d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 7 Aug 2025 11:05:20 +0800 Subject: [PATCH 042/932] [Bugfix] Fix wrong method name in Intern-S1 image processor (#22417) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interns1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index ab21cbe91a..d952ced2fa 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -161,7 +161,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): if not isinstance(processor, GotOcr2ImageProcessorFast): raise ValueError(f'GotOcr2ImageProcessorFast is expected but got ' f'{type(processor)}') - num_image_patches = processor.get_number_of_image_tokens( + num_image_patches = processor.get_number_of_image_patches( image_height, image_width, images_kwargs=dict()) num_image_tokens = self.get_hf_processor( ).image_seq_length * num_image_patches From a00d8b236f515d8c29c6afc2ecb98aef22788ae1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Aug 2025 23:07:47 -0400 Subject: [PATCH 043/932] Use float32 for test_completion.py (#22385) Signed-off-by: Michael Goin --- tests/v1/entrypoints/openai/test_completion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 2462f8f9f1..3a65583fab 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -20,9 +20,8 @@ MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") def default_server_args(): return [ - # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + "float32", "--max-model-len", "2048", "--max-num-seqs", From 5e9455ae8f33599865f8855b28db2d074ea04eb5 Mon Sep 17 00:00:00 2001 From: qscqesze Date: Thu, 7 Aug 2025 11:30:27 +0800 Subject: [PATCH 044/932] [Bugfix]: Fix the streaming output for function calls in the minimax (#22015) Signed-off-by: QscQ Signed-off-by: qingjun --- tests/tool_use/test_minimax_tool_parser.py | 846 ++++++++++++++++- .../tool_parsers/minimax_tool_parser.py | 850 +++++++++++++----- 2 files changed, 1493 insertions(+), 203 deletions(-) diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py index 49b8e4b96f..ddf2600712 100644 --- a/tests/tool_use/test_minimax_tool_parser.py +++ b/tests/tool_use/test_minimax_tool_parser.py @@ -3,10 +3,12 @@ # ruff: noqa: E501 import json +from typing import Any import pytest -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam, + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser from vllm.transformers_utils.tokenizer import get_tokenizer @@ -24,6 +26,57 @@ def minimax_tool_parser(minimax_tokenizer): return MinimaxToolParser(minimax_tokenizer) +@pytest.fixture +def sample_tools(): + return [ + ChatCompletionToolsParam(type="function", + function={ + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + }, + "state": { + "type": "string", + "description": + "The state code" + }, + "unit": { + "type": "string", + "enum": + ["fahrenheit", "celsius"] + } + }, + "required": ["city", 
"state"] + } + }), + ChatCompletionToolsParam(type="function", + function={ + "name": "calculate_area", + "description": + "Calculate area of a shape", + "parameters": { + "type": "object", + "properties": { + "shape": { + "type": "string" + }, + "dimensions": { + "type": "object" + }, + "precision": { + "type": "integer" + } + } + } + }) + ] + + def assert_tool_calls(actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]): assert len(actual_tool_calls) == len(expected_tool_calls) @@ -370,3 +423,794 @@ def test_extract_tool_calls_multiline_json_not_supported(minimax_tool_parser): assert not extracted_tool_calls.tools_called assert extracted_tool_calls.tool_calls == [] assert extracted_tool_calls.content is None + + +def test_streaming_arguments_incremental_output(minimax_tool_parser): + """Test that streaming arguments are returned incrementally, not cumulatively.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + + # Simulate progressive tool call building + stages = [ + # Stage 1: Function name complete + '\n{"name": "get_current_weather", "arguments": ', + # Stage 2: Arguments object starts with first key + '\n{"name": "get_current_weather", "arguments": {"city": ', + # Stage 3: First parameter value added + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle"', + # Stage 4: Second parameter added + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA"', + # Stage 5: Third parameter added, arguments complete + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + # Stage 6: Tool calls closed + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n' + ] + + function_name_sent = False + previous_args_content = "" + + for i, current_text in enumerate(stages): + previous_text = stages[i - 1] if i > 0 else "" + delta_text = current_text[len(previous_text + ):] if i > 0 else current_text + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Stage {i}: Current text: {repr(current_text)}") + print(f"Stage {i}: Delta text: {repr(delta_text)}") + + if result is not None and hasattr(result, + 'tool_calls') and result.tool_calls: + tool_call = result.tool_calls[0] + + # Check if function name is sent (should happen only once) + if tool_call.function and tool_call.function.name: + assert tool_call.function.name == "get_current_weather" + function_name_sent = True + print( + f"Stage {i}: Function name sent: {tool_call.function.name}" + ) + + # Check if arguments are sent incrementally + if tool_call.function and tool_call.function.arguments: + args_fragment = tool_call.function.arguments + print( + f"Stage {i}: Got arguments fragment: {repr(args_fragment)}" + ) + + # For incremental output, each fragment should be new content only + # The fragment should not contain all previous content + if i >= 2 and previous_args_content: # After we start getting arguments + # The new fragment should not be identical to or contain all previous content + assert args_fragment != 
previous_args_content, f"Fragment should be incremental, not cumulative: {args_fragment}" + + # If this is truly incremental, the fragment should be relatively small + # compared to the complete arguments so far + if len(args_fragment) > len(previous_args_content): + print( + "Warning: Fragment seems cumulative rather than incremental" + ) + + previous_args_content = args_fragment + + # Verify function name was sent at least once + assert function_name_sent, "Function name should have been sent" + + +def test_streaming_arguments_delta_only(minimax_tool_parser): + """Test that each streaming call returns only the delta (new part) of arguments.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + + # Simulate two consecutive calls with growing arguments + call1_text = '\n{"name": "test_tool", "arguments": {"param1": "value1"}}' + call2_text = '\n{"name": "test_tool", "arguments": {"param1": "value1", "param2": "value2"}}' + + print(f"Call 1 text: {repr(call1_text)}") + print(f"Call 2 text: {repr(call2_text)}") + + # First call - should get the function name and initial arguments + result1 = minimax_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text=call1_text, + delta_text=call1_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result 1: {result1}") + if result1 and hasattr(result1, 'tool_calls') and result1.tool_calls: + for i, tc in enumerate(result1.tool_calls): + print(f" Tool call {i}: {tc}") + + # Second call - should only get the delta (new part) of arguments + result2 = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=call1_text, + current_text=call2_text, + delta_text=', "param2": "value2"}', + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result 2: {result2}") + if result2 and hasattr(result2, 'tool_calls') and result2.tool_calls: + for i, tc in enumerate(result2.tool_calls): + print(f" Tool call {i}: {tc}") + + # Verify the second call only returns the delta + if result2 is not None and hasattr(result2, + 'tool_calls') and result2.tool_calls: + tool_call = result2.tool_calls[0] + if tool_call.function and tool_call.function.arguments: + args_delta = tool_call.function.arguments + print(f"Arguments delta from second call: {repr(args_delta)}") + + # Should only contain the new part, not the full arguments + # The delta should be something like ', "param2": "value2"}' or just '"param2": "value2"' + assert ', "param2": "value2"}' in args_delta or '"param2": "value2"' in args_delta, f"Expected delta containing param2, got: {args_delta}" + + # Should NOT contain the previous parameter data + assert '"param1": "value1"' not in args_delta, f"Arguments delta should not contain previous data: {args_delta}" + + # The delta should be relatively short (incremental, not cumulative) + expected_max_length = len( + ', "param2": "value2"}') + 10 # Some tolerance + assert len( + args_delta + ) <= expected_max_length, f"Delta seems too long (possibly cumulative): {args_delta}" + + print("✓ Delta validation passed") + else: + print("No arguments in result2 tool call") + else: + print("No tool calls in result2 or result2 is None") + # This might be acceptable if no incremental update is needed + # But let's at least verify that result1 had some content + assert result1 is not None, 
"At least the first call should return something" + + +def test_streaming_openai_compatibility(minimax_tool_parser): + """Test that streaming behavior with buffering works correctly.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + # Reset buffering state + minimax_tool_parser.pending_buffer = "" + minimax_tool_parser.in_thinking_tag = False + minimax_tool_parser.thinking_depth = 0 + + # Test scenario: simple buffering without complex tool call context + test_cases: list[dict[str, Any]] = [ + { + 'stage': 'Token: <', + 'previous': '', + 'current': '<', + 'delta': '<', + 'expected_content': None, # Should be buffered + }, + { + 'stage': 'Token: tool_calls>', + 'previous': '<', + 'current': '', + 'delta': 'tool_calls>', + 'expected_content': None, # Complete tag, should not output + }, + { + 'stage': 'Regular content', + 'previous': 'Hello', + 'current': 'Hello world', + 'delta': ' world', + 'expected_content': ' world', # Normal content should pass through + }, + { + 'stage': 'Content with end tag start', + 'previous': 'Text', + 'current': 'Text content', + 'delta': 'calls>', + 'expected_content': None, # Complete close tag, should not output + }, + ] + + for i, test_case in enumerate(test_cases): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print(f"Previous: {repr(test_case['previous'])}") + print(f"Current: {repr(test_case['current'])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content, got {result}" + print("✓ No content output as expected") + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}" + print(f"✓ Content matches: {repr(result.content)}") + + print("✓ Streaming test with buffering completed successfully") + + +def test_streaming_thinking_tag_buffering(minimax_tool_parser): + """Test that tool calls within thinking tags are properly handled during streaming.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + # Reset buffering state + minimax_tool_parser.pending_buffer = "" + minimax_tool_parser.in_thinking_tag = False + minimax_tool_parser.thinking_depth = 0 + + # Test scenario: tool calls within thinking tags should be ignored + test_cases: list[dict[str, Any]] = [ + { + 'stage': 'Start thinking', + 'previous': '', + 'current': 'I need to use a tool. ', + 'delta': 'I need to use a tool. ', + 'expected_content': + 'I need to use a tool. ', # Should pass through as content + }, + { + 'stage': + 'Tool call in thinking', + 'previous': + 'I need to use a tool. ', + 'current': + 'I need to use a tool. 
\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'delta': + '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'expected_content': + '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', # should be preserved in thinking tags + }, + { + 'stage': 'Real tool call after thinking', + 'previous': + 'I need to use a tool. \n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'current': + 'I need to use a tool. \n{"name": "ignored_tool", "arguments": {"param": "value"}}\n\n', + 'delta': '\n', + 'expected_content': + '\n', # Should output '\n' and suppress + } + ] + + for i, test_case in enumerate(test_cases): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print(f"Previous: {repr(test_case['previous'])}") + print(f"Current: {repr(test_case['current'])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if 'expected_content' in test_case: + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content, got {result}" + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}" + print(f"✓ Content matches: {repr(result.content)}") + + # Check tool calls + if test_case.get('expected_tool_call'): + assert result is not None and hasattr(result, 'tool_calls') and result.tool_calls, \ + f"Stage {i}: Expected tool call, got {result}" + + tool_call = result.tool_calls[0] + assert tool_call.function.name == "real_tool", \ + f"Expected real_tool, got {tool_call.function.name}" + print(f"✓ Real tool call detected: {tool_call.function.name}") + + print("✓ Thinking tag buffering test completed successfully") + + +def reset_streaming_state(minimax_tool_parser): + """Helper function to properly reset the streaming state for MinimaxToolParser.""" + # Reset minimax-specific state + minimax_tool_parser._reset_streaming_state() + + # Reset base class state (these should still be reset for compatibility) + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.streamed_args_for_tool = [] + + +def test_streaming_complex_scenario_with_multiple_tools(minimax_tool_parser): + """Test complex streaming scenario: tools inside tags and multiple tool calls in one group.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Complex scenario: tools inside thinking tags and multiple tools in one group + test_stages: list[dict[str, Any]] = [ + { + 'stage': 'Initial content', + 'previous': '', + 'current': 'Let me help you with this task.', + 'delta': 'Let me help you with this task.', + 'expected_content': 'Let me help you with this task.', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Start thinking tag', + 'previous': 'Let me help you with this task.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.', + 'delta': 'I need to analyze this situation first.', + 'expected_content': + 'I need to analyze 
this situation first.', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Tool call inside thinking tag starts', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.', + 'delta': '', + 'expected_content': + '', # Inside thinking tags, tool tags should be preserved as content + 'expected_tool_calls': 0, + }, + { + 'stage': 'Complete tool call inside thinking tag', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'delta': + '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'expected_content': + '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'expected_tool_calls': + 0, # Tools inside thinking tags should be ignored + }, + { + 'stage': 'End thinking tag', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'delta': '', + 'expected_content': '', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Multiple tools group starts', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.', + 'delta': + '\nNow I need to get weather information and calculate area.', + 'expected_content': + '\nNow I need to get weather information and calculate area.', # should be filtered + 'expected_tool_calls': 0, + }, + { + 'stage': 'First tool in group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'delta': + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'expected_content': + None, # No content should be output when tool call is in progress + 'expected_tool_calls': 1, + 'expected_tool_name': 'get_current_weather', + }, + { + 'stage': 'Second tool in group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need 
to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'delta': + '\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'expected_content': None, + 'expected_tool_calls': 1, + 'expected_tool_name': 'calculate_area', + }, + { + 'stage': 'Complete tool calls group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'delta': '', + 'expected_content': None, + 'expected_tool_calls': 0, + } + ] + + tool_calls_count = 0 + + for i, test_case in enumerate(test_stages): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print( + f"Previous: {repr(test_case['previous'][:100])}{'...' if len(test_case['previous']) > 100 else ''}" + ) + print(f"Current: {repr(test_case['current'][-100:])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content output, got {result}" + print("✓ No content output as expected") + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content output, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {repr(test_case['expected_content'])}, got {repr(result.content)}" + print(f"✓ Content matches: {repr(result.content)}") + + # Check tool calls + expected_tool_calls = test_case['expected_tool_calls'] + actual_tool_calls = len(result.tool_calls) if result and hasattr( + result, 'tool_calls') and result.tool_calls else 0 + + if expected_tool_calls > 0: + assert actual_tool_calls >= expected_tool_calls, \ + f"Stage {i}: Expected at least {expected_tool_calls} tool calls, got {actual_tool_calls}" + + if 'expected_tool_name' in test_case: + # Find the tool call with the expected name + found_tool_call = None + for tool_call in result.tool_calls: + if tool_call.function.name == test_case[ + 'expected_tool_name']: + found_tool_call = tool_call + break + + assert found_tool_call is not None, \ + f"Stage {i}: Expected tool name {test_case['expected_tool_name']} not found in tool calls: {[tc.function.name for tc in result.tool_calls]}" + print(f"✓ Tool call correct: 
{found_tool_call.function.name}") + + # Ensure tools inside thinking tags are not called + assert found_tool_call.function.name != "internal_analysis", \ + f"Stage {i}: Tool 'internal_analysis' inside thinking tags should not be called" + + tool_calls_count += actual_tool_calls + print(f"✓ Detected {actual_tool_calls} tool calls") + else: + assert actual_tool_calls == 0, \ + f"Stage {i}: Expected no tool calls, got {actual_tool_calls}" + + # Verify overall results + print("\n=== Test Summary ===") + print(f"Total tool calls count: {tool_calls_count}") + assert tool_calls_count >= 2, f"Expected at least 2 valid tool calls (outside thinking tags), but got {tool_calls_count}" + + print("✓ Complex streaming test completed:") + print(" - ✓ Tools inside thinking tags correctly ignored") + print(" - ✓ Two tool groups outside thinking tags correctly parsed") + print(" - ✓ Content and tool call streaming correctly handled") + print(" - ✓ Buffering mechanism works correctly") + + +def test_streaming_character_by_character_output(minimax_tool_parser): + """Test character-by-character streaming output to simulate real streaming scenarios.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Complete text that will be streamed character by character + complete_text = """I'll help you with the weather analysis. Let me think about this. +{"name": "internal_analysis", "arguments": {"type": "thinking"}} +This tool should be ignored. + +Now I'll get the weather information for you. +{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}} +{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}} +Here are the results.""" + + print("\n=== Starting character-by-character streaming test ===") + print(f"Complete text length: {len(complete_text)} characters") + + # Track the streaming results + content_fragments = [] + tool_calls_detected = [] + + # Stream character by character + for i in range(1, len(complete_text) + 1): + current_text = complete_text[:i] + previous_text = complete_text[:i - 1] if i > 1 else "" + delta_text = complete_text[i - 1:i] + + # Show progress every 50 characters + if i % 50 == 0 or i == len(complete_text): + print(f"Progress: {i}/{len(complete_text)} characters") + + # Call the streaming parser + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + # Collect results + if result is not None: + if hasattr(result, 'content') and result.content: + content_fragments.append(result.content) + # Log important content fragments + if any( + keyword in result.content for keyword in + ['', '', '', '']): + print( + f" Char {i}: Content fragment: {repr(result.content)}" + ) + + if hasattr(result, 'tool_calls') and result.tool_calls: + for tool_call in result.tool_calls: + tool_info = { + 'character_position': + i, + 'function_name': + tool_call.function.name + if tool_call.function else None, + 'arguments': + tool_call.function.arguments + if tool_call.function else None, + } + tool_calls_detected.append(tool_info) + print( + f" Char {i}: Tool call detected: {tool_call.function.name}" + ) + if tool_call.function.arguments: + print( + f" Arguments: {repr(tool_call.function.arguments)}" + ) + + # Verify results + print("\n=== Streaming Test Results ===") + print(f"Total content fragments: 
{len(content_fragments)}") + print(f"Total tool calls detected: {len(tool_calls_detected)}") + + # Reconstruct content from fragments + reconstructed_content = ''.join(content_fragments) + print(f"Reconstructed content length: {len(reconstructed_content)}") + + # Verify thinking tags content is preserved + assert '' in reconstructed_content, "Opening thinking tag should be preserved in content" + assert '' in reconstructed_content, "Closing thinking tag should be preserved in content" + + # Verify that tool calls inside thinking tags are NOT extracted as actual tool calls + thinking_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'internal_analysis' + ] + assert len( + thinking_tool_calls + ) == 0, f"Tool calls inside thinking tags should be ignored, but found: {thinking_tool_calls}" + + # Verify that real tool calls outside thinking tags ARE extracted + weather_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'get_current_weather' + ] + area_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'calculate_area' + ] + print(tool_calls_detected) + assert len(weather_tool_calls + ) > 0, "get_current_weather tool call should be detected" + assert len( + area_tool_calls) > 0, "calculate_area tool call should be detected" + + # Verify tool call arguments are properly streamed + weather_args_found = any(tc['arguments'] for tc in weather_tool_calls + if tc['arguments']) + area_args_found = any(tc['arguments'] for tc in area_tool_calls + if tc['arguments']) + + print(f"Weather tool call with arguments: {weather_args_found}") + print(f"Area tool call with arguments: {area_args_found}") + + # Verify content before and after tool calls + assert 'I\'ll help you with the weather analysis.' in reconstructed_content, "Initial content should be preserved" + assert 'Here are the results.' in reconstructed_content, "Final content should be preserved" + + # Verify that and tags are not included in the final content + # (they should be filtered out when not inside thinking tags) + content_outside_thinking = reconstructed_content + # Remove thinking tag content to check content outside + if '' in content_outside_thinking and '' in content_outside_thinking: + start_think = content_outside_thinking.find('') + end_think = content_outside_thinking.find('') + len('') + content_outside_thinking = content_outside_thinking[: + start_think] + content_outside_thinking[ + end_think:] + + # Outside thinking tags, tool_calls tags should be filtered + tool_calls_in_content = content_outside_thinking.count('') + assert tool_calls_in_content == 0, f" tags should be filtered from content outside thinking tags, but found {tool_calls_in_content}" + + print( + "\n=== Character-by-character streaming test completed successfully ===" + ) + print("✓ Tool calls inside thinking tags correctly ignored") + print("✓ Tool calls outside thinking tags correctly detected") + print("✓ Content properly streamed and reconstructed") + print("✓ Tool call tags properly filtered from content") + print("✓ Character-level streaming works correctly") + + +def test_streaming_character_by_character_simple_tool_call( + minimax_tool_parser): + """Test character-by-character streaming for a simple tool call scenario.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Simple tool call text + simple_text = 'Let me check the weather. 
\n{"name": "get_weather", "arguments": {"city": "NYC"}}\n' + + print("\n=== Simple character-by-character test ===") + print(f"Text: {repr(simple_text)}") + + content_parts = [] + tool_name_sent = False + tool_args_sent = False + + for i in range(1, len(simple_text) + 1): + current_text = simple_text[:i] + previous_text = simple_text[:i - 1] if i > 1 else "" + delta_text = simple_text[i - 1:i] + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + if result: + if hasattr(result, 'content') and result.content: + content_parts.append(result.content) + print( + f" Char {i} ({repr(delta_text)}): Content: {repr(result.content)}" + ) + + if hasattr(result, 'tool_calls') and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function and tool_call.function.name: + tool_name_sent = True + print( + f" Char {i}: Tool name: {tool_call.function.name}" + ) + if tool_call.function and tool_call.function.arguments: + tool_args_sent = True + print( + f" Char {i}: Tool args: {repr(tool_call.function.arguments)}" + ) + + # Verify basic expectations + reconstructed_content = ''.join(content_parts) + print(f"Final reconstructed content: {repr(reconstructed_content)}") + + assert tool_name_sent, "Tool name should be sent during streaming" + assert tool_args_sent, "Tool arguments should be sent during streaming" + assert "Let me check the weather." in reconstructed_content, "Initial content should be preserved" + + print("✓ Simple character-by-character test passed") + + +def test_streaming_character_by_character_with_buffering(minimax_tool_parser): + """Test character-by-character streaming with edge cases that trigger buffering.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Text that includes potential buffering scenarios + buffering_text = 'Hello world\n{"name": "test"}\ndone' + + print("\n=== Buffering character-by-character test ===") + print(f"Text: {repr(buffering_text)}") + + all_content = [] + + for i in range(1, len(buffering_text) + 1): + current_text = buffering_text[:i] + previous_text = buffering_text[:i - 1] if i > 1 else "" + delta_text = buffering_text[i - 1:i] + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + if result and hasattr(result, 'content') and result.content: + all_content.append(result.content) + print(f" Char {i} ({repr(delta_text)}): {repr(result.content)}") + + final_content = ''.join(all_content) + print(f"Final content: {repr(final_content)}") + + # The parser should handle the edge case where appears before + assert "Hello" in final_content, "Initial 'Hello' should be preserved" + assert "world" in final_content, "Content after false closing tag should be preserved" + assert "done" in final_content, "Final content should be preserved" + + print("✓ Buffering character-by-character test passed") diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 6ba32e38fc..226309ef29 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -3,11 +3,9 @@ import json from collections.abc import 
Sequence -from typing import Union +from typing import Any, Optional, Union -import partial_json_parser import regex as re -from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -17,6 +15,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -29,25 +29,32 @@ class MinimaxToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) - self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: list[dict] = [] - self.current_tool_id: int = -1 - self.streamed_args_for_tool: list[str] = [] - - self.tool_call_start_token: str = "<tool_calls>" - self.tool_call_end_token: str = "</tool_calls>" + # Initialize streaming state for tracking tool call progress + self.streaming_state: dict[str, Any] = { + "current_tool_index": -1, # Index of current tool being processed + "tool_ids": [], # List of tool call IDs + "sent_tools": [], # List of tools that have been sent + } + # Define tool call tokens and patterns + self.tool_call_start_token = "<tool_calls>" + self.tool_call_end_token = "</tool_calls>" self.tool_call_regex = re.compile( r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL) - - # Add regex pattern for thinking tag self.thinking_tag_pattern = r"<think>(.*?)</think>" + self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"') + self.tool_args_pattern = re.compile(r'"arguments":\s*') + + # Buffer for handling partial tool calls during streaming + self.pending_buffer = "" + self.in_thinking_tag = False if not self.model_tokenizer: raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") + # Get token IDs for tool call start/end tokens self.tool_call_start_token_id = self.vocab.get( self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) @@ -60,33 +67,95 @@ class MinimaxToolParser(ToolParser): def preprocess_model_output(self, model_output: str) -> str: """ - Remove tool calls from within thinking tags to avoid processing them. + Preprocess model output by removing tool calls from thinking tags. + + Args: + model_output: Raw model output string + + Returns: + Preprocessed model output with tool calls removed from thinking tags + """ def remove_tool_calls_from_think(match): think_content = match.group(1) - # Remove tool_calls from within the think tag cleaned_content = re.sub(r"<tool_calls>.*?</tool_calls>", "", think_content, flags=re.DOTALL) return f"<think>{cleaned_content}</think>" - # Process thinking tags and remove tool_calls from within them - processed_output = re.sub(self.thinking_tag_pattern, - remove_tool_calls_from_think, - model_output, - flags=re.DOTALL) + return re.sub(self.thinking_tag_pattern, + remove_tool_calls_from_think, + model_output, + flags=re.DOTALL) - return processed_output + def _clean_duplicate_braces(self, args_text: str) -> str: + """ + Clean duplicate closing braces from arguments text. 
+ + Args: + args_text: Raw arguments text + + Returns: + Cleaned arguments text with proper JSON formatting + """ + args_text = args_text.strip() + if not args_text: + return args_text + + try: + json.loads(args_text) + return args_text + except json.JSONDecodeError: + pass + + while args_text.endswith('}}'): + candidate = args_text[:-1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + args_text = candidate + + return args_text + + def _clean_delta_braces(self, delta_text: str) -> str: + """ + Clean delta text by removing excessive closing braces. + + Args: + delta_text: Delta text to clean + + Returns: + Cleaned delta text + """ + if not delta_text: + return delta_text + + delta_stripped = delta_text.strip() + + if delta_stripped and all(c in '}\n\r\t ' for c in delta_stripped): + brace_count = delta_stripped.count('}') + if brace_count > 1: + return '}\n' if delta_text.endswith('\n') else '}' + + return delta_text def extract_tool_calls( self, model_output: str, request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: - - # Preprocess to remove tool calls from thinking tags + """ + Extract tool calls from model output for non-streaming mode. + + Args: + model_output: Complete model output + request: Chat completion request + + Returns: + ExtractedToolCallInformation containing tool calls and content + """ processed_output = self.preprocess_model_output(model_output) if self.tool_call_start_token not in processed_output: @@ -95,8 +164,8 @@ class MinimaxToolParser(ToolParser): content=model_output) try: - function_call_tuples = ( - self.tool_call_regex.findall(processed_output)) + function_call_tuples = self.tool_call_regex.findall( + processed_output) raw_function_calls = [] for match in function_call_tuples: @@ -124,21 +193,15 @@ class MinimaxToolParser(ToolParser): function_call["arguments"], ensure_ascii=False)))) - # Extract content before the first valid tool call - # Find the position in processed output, then map back to original processed_pos = processed_output.find(self.tool_call_start_token) if processed_pos != -1: - # Get the content before tool calls in processed output processed_content = processed_output[:processed_pos].strip() if processed_content: - # Find the end of this content in the original output - # Look for the last non-empty line of processed content lines = processed_content.split('\n') for line in reversed(lines): line = line.strip() if line: - # Find this line in original output pos = model_output.find(line) if pos != -1: content = model_output[:pos + len(line)] @@ -162,6 +225,445 @@ class MinimaxToolParser(ToolParser): tool_calls=[], content=model_output) + def _update_thinking_state(self, text: str) -> None: + """ + Update the thinking tag state based on text content. + + Args: + text: Text to analyze for thinking tags + """ + open_count = text.count("<think>") + close_count = text.count("</think>") + self.in_thinking_tag = open_count > close_count or ( + open_count == close_count and text.endswith("</think>")) + + def _is_potential_tag_start(self, text: str) -> bool: + """ + Check if text might be the start of a tool call tag. 
+ + Args: + text: Text to check + + Returns: + True if text could be the start of a tool call tag + """ + for tag in [self.tool_call_start_token, self.tool_call_end_token]: + if any( + tag.startswith(text[-i:]) + for i in range(1, min(len(text) + 1, len(tag)))): + return True + return False + + def _should_buffer_content(self, delta_text: str) -> bool: + """ + Determine if content should be buffered for later processing. + + Args: + delta_text: Delta text to check + + Returns: + True if content should be buffered + """ + if self.in_thinking_tag: + return False + return bool(self.pending_buffer + or self.tool_call_start_token in delta_text + or self.tool_call_end_token in delta_text + or delta_text.startswith('<')) + + def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]: + """ + Split delta text into safe content and potential tag content. + + Args: + delta_text: Delta text to split + + Returns: + Tuple of (safe_content, potential_tag_content) + """ + if self.in_thinking_tag: + return delta_text, "" + + for tag in [self.tool_call_start_token, self.tool_call_end_token]: + for i in range(1, len(tag)): + tag_prefix = tag[:i] + pos = delta_text.rfind(tag_prefix) + if pos != -1 and tag.startswith(delta_text[pos:]): + return delta_text[:pos], delta_text[pos:] + return delta_text, "" + + def _process_buffer(self, new_content: str) -> str: + """ + Process buffered content and return output content. + + Args: + new_content: New content to add to buffer + + Returns: + Processed output content + """ + self.pending_buffer += new_content + output_content = "" + + if self.in_thinking_tag: + output_content = self.pending_buffer + self.pending_buffer = "" + return output_content + + while self.pending_buffer: + start_pos = self.pending_buffer.find(self.tool_call_start_token) + end_pos = self.pending_buffer.find(self.tool_call_end_token) + + if start_pos != -1 and (end_pos == -1 or start_pos < end_pos): + tag_pos, tag_len = start_pos, len(self.tool_call_start_token) + elif end_pos != -1: + tag_pos, tag_len = end_pos, len(self.tool_call_end_token) + else: + if self._is_potential_tag_start(self.pending_buffer): + break + output_content += self.pending_buffer + self.pending_buffer = "" + break + + output_content += self.pending_buffer[:tag_pos] + self.pending_buffer = self.pending_buffer[tag_pos + tag_len:] + + return output_content + + def _reset_streaming_state(self) -> None: + """Reset the streaming state to initial values.""" + self.streaming_state = { + "current_tool_index": -1, + "tool_ids": [], + "sent_tools": [], + } + + def _advance_to_next_tool(self) -> None: + """Advance to the next tool in the streaming sequence.""" + self.streaming_state["current_tool_index"] = int( + self.streaming_state["current_tool_index"]) + 1 + + def _set_current_tool_index(self, index: int) -> None: + """ + Set the current tool index. + + Args: + index: Tool index to set + """ + self.streaming_state["current_tool_index"] = index + + def _get_current_tool_index(self) -> int: + """ + Get the current tool index. + + Returns: + Current tool index + """ + return int(self.streaming_state["current_tool_index"]) + + def _get_next_unsent_tool_index(self, tool_count: int) -> int: + """ + Get the index of the next unsent tool. 
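The buffering helpers above hold back any trailing piece of a delta that could still turn into a tool-call tag, so a tag that arrives split across deltas is never leaked to the client as plain content. A standalone sketch, using a hypothetical "<tool_calls>" tag (the real token strings are model-specific and configured on the parser):

def split_for_buffering(delta_text: str, tags: list[str]) -> tuple[str, str]:
    # Return (safe_content, potential_tag_content). The second part is the
    # tail of delta_text, starting at the right-most position whose remainder
    # is a prefix of one of the tags; it should be buffered until more text
    # arrives and either completes or rules out the tag.
    for tag in tags:
        for i in range(1, len(tag)):
            prefix = tag[:i]
            pos = delta_text.rfind(prefix)
            if pos != -1 and tag.startswith(delta_text[pos:]):
                return delta_text[:pos], delta_text[pos:]
    return delta_text, ""

# split_for_buffering("Let me check the weather <tool", ["<tool_calls>"])
# -> ("Let me check the weather ", "<tool")      (tag string is illustrative)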
+ + Args: + tool_count: Total number of tools + + Returns: + Index of next unsent tool, or -1 if all tools sent + """ + sent_tools = list(self.streaming_state["sent_tools"]) + for i in range(tool_count): + if i < len(sent_tools): + if not sent_tools[i]["sent_name"]: + return i + else: + return i + return -1 + + def _ensure_state_arrays(self, tool_count: int) -> None: + """ + Ensure state arrays have sufficient capacity for tool_count tools. + + Args: + tool_count: Number of tools to prepare for + """ + sent_tools = list(self.streaming_state["sent_tools"]) + tool_ids = list(self.streaming_state["tool_ids"]) + + while len(sent_tools) < tool_count: + sent_tools.append({ + "sent_name": False, + "sent_arguments": "", + "id": random_tool_call_id(), + }) + + while len(tool_ids) < tool_count: + tool_ids.append(None) + + self.streaming_state["sent_tools"] = sent_tools + self.streaming_state["tool_ids"] = tool_ids + + def _detect_tools_in_text(self, text: str) -> int: + """ + Detect the number of tools in text by counting name patterns. + + Args: + text: Text to analyze + + Returns: + Number of tools detected + """ + matches = self.tool_name_pattern.findall(text) + return len(matches) + + def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]: + """ + Find the boundaries of tool calls in text. + + Args: + text: Text to analyze + + Returns: + List of (start, end) positions for tool calls + """ + boundaries = [] + i = 0 + while i < len(text): + if text[i] == '{': + start = i + depth = 0 + has_name = False + has_arguments = False + + while i < len(text): + if text[i] == '{': + depth += 1 + elif text[i] == '}': + depth -= 1 + if depth == 0: + end = i + 1 + segment = text[start:end] + if '"name"' in segment and '"arguments"' in segment: + boundaries.append((start, end)) + break + + if not has_name and '"name"' in text[start:i + 1]: + has_name = True + if not has_arguments and '"arguments"' in text[start:i + + 1]: + has_arguments = True + + i += 1 + + if depth > 0 and has_name: + boundaries.append((start, i)) + else: + i += 1 + return boundaries + + def _extract_tool_args(self, tool_content: str, args_match) -> str: + """ + Extract tool arguments from tool content. + + Args: + tool_content: Tool call content + args_match: Regex match for arguments pattern + + Returns: + Extracted arguments as string + """ + args_start_pos = args_match.end() + remaining_content = tool_content[args_start_pos:] + + if remaining_content.strip().startswith('{'): + depth = 0 + for i, char in enumerate(remaining_content): + if char == '{': + depth += 1 + elif char == '}': + depth -= 1 + if depth == 0: + return remaining_content[:i + 1] + else: + args_end = remaining_content.find('}') + if args_end > 0: + return remaining_content[:args_end].strip() + + return remaining_content.rstrip('}').strip() + + def _get_current_tool_content( + self, text: str, + tool_index: int) -> tuple[Optional[str], Optional[str]]: + """ + Get the content of a specific tool by index. 
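Boundary detection above walks the text with a brace-depth counter rather than a JSON parser, so objects that are still incomplete mid-stream can be located. A self-contained sketch of the same scan (simplified: the has_name/has_arguments bookkeeping is folded into plain substring checks):

def find_tool_boundaries(text: str) -> list[tuple[int, int]]:
    # Locate JSON-object spans that look like tool calls. A balanced object
    # counts only if it mentions both "name" and "arguments"; an object that
    # is still open at the end of the text counts as long as it has a name,
    # so its arguments can be streamed while incomplete.
    boundaries: list[tuple[int, int]] = []
    i = 0
    while i < len(text):
        if text[i] != '{':
            i += 1
            continue
        start, depth = i, 0
        while i < len(text):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
                if depth == 0:
                    segment = text[start:i + 1]
                    if '"name"' in segment and '"arguments"' in segment:
                        boundaries.append((start, i + 1))
                    break
            i += 1
        if depth > 0 and '"name"' in text[start:]:
            boundaries.append((start, len(text)))
        i += 1
    return boundaries

# find_tool_boundaries('{"name": "f", "arguments": {"x": 1}}') -> [(0, 36)]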
+ + Args: + text: Text containing tool calls + tool_index: Index of tool to extract + + Returns: + Tuple of (tool_name, tool_arguments) or (None, None) if not found + """ + boundaries = self._find_tool_boundaries(text) + + if tool_index >= len(boundaries): + return None, None + + start, end = boundaries[tool_index] + tool_content = text[start:end] + + name_match = self.tool_name_pattern.search(tool_content) + name = name_match.group(1) if name_match else None + + args_match = self.tool_args_pattern.search(tool_content) + if args_match: + try: + args_text = self._extract_tool_args(tool_content, args_match) + return name, args_text + except Exception: + remaining_content = tool_content[args_match.end():] + args_text = remaining_content.rstrip('}').strip() + return name, args_text + + return name, None + + def _handle_tool_name_streaming( + self, tool_content: str, + tool_count: int) -> Union[DeltaMessage, None]: + """ + Handle streaming of tool names. + + Args: + tool_content: Content containing tool calls + tool_count: Total number of tools + + Returns: + DeltaMessage with tool name or None if no tool to stream + """ + next_idx = self._get_next_unsent_tool_index(tool_count) + + if next_idx == -1: + return None + + boundaries = self._find_tool_boundaries(tool_content) + if next_idx >= len(boundaries): + return None + + tool_name, _ = self._get_current_tool_content(tool_content, next_idx) + if not tool_name: + return None + + self._set_current_tool_index(next_idx) + sent_tools = list(self.streaming_state["sent_tools"]) + tool_ids = list(self.streaming_state["tool_ids"]) + + tool_id = sent_tools[next_idx]["id"] + tool_ids[next_idx] = tool_id + sent_tools[next_idx]["sent_name"] = True + + self.streaming_state["sent_tools"] = sent_tools + self.streaming_state["tool_ids"] = tool_ids + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=next_idx, + type="function", + id=tool_id, + function=DeltaFunctionCall( + name=tool_name).model_dump(exclude_none=True)) + ]) + + def _handle_tool_args_streaming( + self, tool_content: str, + tool_count: int) -> Union[DeltaMessage, None]: + """ + Handle streaming of tool arguments. 
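Argument streaming below only ever emits the portion of the (cleaned) argument string that has not been sent yet; the real implementation goes through extract_intermediate_diff and the _clean_* helpers above, but the core bookkeeping reduces to a prefix diff along these lines (a simplified sketch, not the exact code):

def args_delta(sent_args: str, current_args: str) -> str:
    # Emit only the new suffix when the previously streamed arguments are a
    # prefix of the current ones; otherwise re-send the full string (e.g.
    # when brace cleanup rewrote earlier characters).
    if sent_args and current_args.startswith(sent_args):
        return current_args[len(sent_args):]
    return current_args

# args_delta('{"city": "Pa', '{"city": "Paris"}') -> 'ris"}'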
+ + Args: + tool_content: Content containing tool calls + tool_count: Total number of tools + + Returns: + DeltaMessage with tool arguments or None if no arguments to stream + """ + current_idx = self._get_current_tool_index() + + if current_idx < 0 or current_idx >= tool_count: + return None + + tool_name, tool_args = self._get_current_tool_content( + tool_content, current_idx) + if not tool_name or tool_args is None: + return None + + sent_tools = list(self.streaming_state["sent_tools"]) + + if not sent_tools[current_idx]["sent_name"]: + return None + + clean_args = self._clean_duplicate_braces(tool_args) + sent_args = sent_tools[current_idx]["sent_arguments"] + + if clean_args != sent_args: + if sent_args and clean_args.startswith(sent_args): + args_delta = extract_intermediate_diff(clean_args, sent_args) + if args_delta: + args_delta = self._clean_delta_braces(args_delta) + sent_tools[current_idx]["sent_arguments"] = clean_args + self.streaming_state["sent_tools"] = sent_tools + + if clean_args.endswith('}'): + self._advance_to_next_tool() + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=current_idx, + function=DeltaFunctionCall( + arguments=args_delta).model_dump( + exclude_none=True)) + ]) + elif not sent_args and clean_args: + clean_args_delta = self._clean_delta_braces(clean_args) + sent_tools[current_idx]["sent_arguments"] = clean_args + self.streaming_state["sent_tools"] = sent_tools + + if clean_args.endswith('}'): + self._advance_to_next_tool() + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=current_idx, + function=DeltaFunctionCall( + arguments=clean_args_delta).model_dump( + exclude_none=True)) + ]) + + return None + + def _is_end_tool_calls(self, current_text: str) -> bool: + if self.tool_call_end_token not in current_text: + return False + + end_token_positions = [] + search_start = 0 + while True: + pos = current_text.find(self.tool_call_end_token, search_start) + if pos == -1: + break + end_token_positions.append(pos) + search_start = pos + 1 + + think_regions = [] + for match in re.finditer(self.thinking_tag_pattern, + current_text, + flags=re.DOTALL): + think_regions.append((match.start(), match.end())) + + for pos in end_token_positions: + in_think = any(pos >= t_start and pos < t_end + for t_start, t_end in think_regions) + if not in_think: + return True + + return False + def extract_tool_calls_streaming( self, previous_text: str, @@ -172,13 +674,37 @@ class MinimaxToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: - logger.debug("delta_text: %s", delta_text) - logger.debug("delta_token_ids: %s", delta_token_ids) + self._update_thinking_state(current_text) + + if self.in_thinking_tag: + return DeltaMessage(content=delta_text) + + if self._should_buffer_content(delta_text): + buffered_output = self._process_buffer(delta_text) + return DeltaMessage( + content=buffered_output) if buffered_output else None + + if self._is_end_tool_calls(current_text): + return DeltaMessage(content=delta_text) + + safe_content, potential_tag = self._split_content_for_buffering( + delta_text) + if potential_tag: + self.pending_buffer += potential_tag + return DeltaMessage(content=safe_content) if safe_content else None - # Preprocess to remove tool calls from thinking tags processed_current_text = self.preprocess_model_output(current_text) if self.tool_call_start_token not in processed_current_text: + if (self.tool_call_end_token in delta_text + and self.tool_call_start_token in current_text): + 
return None + if delta_text.strip( + ) == '' and self.tool_call_start_token in current_text: + return None + if (self._get_current_tool_index() != -1 + and self.tool_call_end_token in current_text): + self._reset_streaming_state() return DeltaMessage(content=delta_text) if (self.tool_call_start_token_id is not None @@ -186,184 +712,104 @@ class MinimaxToolParser(ToolParser): and len(delta_token_ids) == 1): return None - original_tool_call_start_pos = current_text.find( - self.tool_call_start_token) - if original_tool_call_start_pos > 0: - delta_start_pos = len(current_text) - len(delta_text) - if delta_start_pos < original_tool_call_start_pos: - content_part = delta_text - if delta_start_pos + len( - delta_text) > original_tool_call_start_pos: - content_part = delta_text[:original_tool_call_start_pos - - delta_start_pos] - if content_part: - return DeltaMessage(content=content_part) + original_tool_start = self._find_tool_start_outside_thinking( + current_text) + if original_tool_start is None: + return None - flags = Allow.ALL if self.current_tool_name_sent \ - else Allow.ALL & ~Allow.STR + content_before_tools = self._extract_content_before_tools( + current_text, delta_text, original_tool_start) + if content_before_tools: + return DeltaMessage(content=content_before_tools) try: - parsable_content = processed_current_text.split( - self.tool_call_start_token)[-1].split( - self.tool_call_end_token)[0] + tool_content = self._extract_tool_content(current_text, + original_tool_start) + current_tools_count = self._detect_tools_in_text(tool_content) - tool_call_arr = [] - if parsable_content.strip(): - lines = parsable_content.strip().split('\n') - for line in lines: - line = line.strip() - if line and (line.startswith('{') or '"name"' in line): - try: - if line.endswith('}'): - parsed_call = json.loads(line) - tool_call_arr.append(parsed_call) - else: - parsed_call = partial_json_parser.loads( - line, flags) - if parsed_call and isinstance( - parsed_call, dict): - tool_call_arr.append(parsed_call) - except (json.JSONDecodeError, partial_json_parser.core. - exceptions.MalformedJSON): - continue - - current_tool_call: dict = tool_call_arr[self.current_tool_id] \ - if len(tool_call_arr) > self.current_tool_id >= 0 else {} - - if len(tool_call_arr) == 0: + if current_tools_count == 0: return None - # Starting a new tool in the array - elif (len(tool_call_arr) > 0 - and len(tool_call_arr) > self.current_tool_id + 1): + if self._get_current_tool_index() == -1: + self._reset_streaming_state() - # Handle any missed arguments from previous tool - if self.current_tool_id >= 0 and self.current_tool_id < len( - self.prev_tool_call_arr): - prev_tool_call = self.prev_tool_call_arr[ - self.current_tool_id] - diff_arguments = prev_tool_call.get("arguments") + self._ensure_state_arrays(current_tools_count) - if diff_arguments: - diff_arguments_json = json.dumps(diff_arguments, - ensure_ascii=False) - already_streamed = self.streamed_args_for_tool[ - self. 
- current_tool_id] if self.current_tool_id < len( - self.streamed_args_for_tool) else "" - - if diff_arguments_json != already_streamed: - diff = diff_arguments_json[len(already_streamed):] - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=diff).model_dump( - exclude_none=True)) - ]) - if self.current_tool_id < len( - self.streamed_args_for_tool): - self.streamed_args_for_tool[ - self.current_tool_id] = diff_arguments_json - else: - delta = None - else: - delta = None - else: - delta = None - - self.current_tool_id = len(tool_call_arr) - 1 - self.current_tool_name_sent = False - self.streamed_args_for_tool.append("") - logger.debug("starting on new tool %d", self.current_tool_id) - return delta - - # Send tool name if not sent yet - if not self.current_tool_name_sent: - function_name = current_tool_call.get("name") - if function_name: - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - type="function", - id=random_tool_call_id(), - function=DeltaFunctionCall( - name=function_name).model_dump( - exclude_none=True)) - ]) - self.current_tool_name_sent = True - else: - delta = None - - # Stream arguments - else: - prev_arguments = None - if (self.current_tool_id < len(self.prev_tool_call_arr) - and self.prev_tool_call_arr[self.current_tool_id]): - prev_arguments = self.prev_tool_call_arr[ - self.current_tool_id].get("arguments") - - cur_arguments = current_tool_call.get("arguments") - - if not cur_arguments and not prev_arguments: - delta = None - elif not cur_arguments and prev_arguments: - logger.error( - "Arguments reset mid-call, skipping streaming") - delta = None - elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) - logger.debug("First tokens in arguments received: %s", - cur_arguments_json) - - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=cur_arguments_json). - model_dump(exclude_none=True)) - ]) - self.streamed_args_for_tool[ - self.current_tool_id] = cur_arguments_json - - elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments, - ensure_ascii=False) - prev_args_json = json.dumps(prev_arguments, - ensure_ascii=False) - - logger.debug("Searching for diff between \n%s\n%s", - cur_args_json, prev_args_json) - - already_streamed = self.streamed_args_for_tool[ - self.current_tool_id] if self.current_tool_id < len( - self.streamed_args_for_tool) else "" - - if cur_args_json.startswith(already_streamed): - argument_diff = cur_args_json[len(already_streamed):] - elif cur_args_json != already_streamed: - argument_diff = cur_args_json - self.streamed_args_for_tool[self.current_tool_id] = "" - else: - argument_diff = "" - - if argument_diff: - logger.debug("got arguments diff: %s", argument_diff) - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=argument_diff). 
- model_dump(exclude_none=True)) - ]) - self.streamed_args_for_tool[ - self.current_tool_id] += argument_diff - else: - delta = None - else: - delta = None - - self.prev_tool_call_arr = tool_call_arr - return delta + return (self._handle_tool_name_streaming(tool_content, + current_tools_count) + or self._handle_tool_args_streaming( + tool_content, current_tools_count)) except Exception: - logger.exception("An unexpected error occurred", + logger.exception("An unexpected error occurred ", "during streaming tool call handling.") return None + + def _find_tool_start_outside_thinking(self, + current_text: str) -> Optional[int]: + """ + Find the start position of tool calls outside of thinking tags. + + Args: + current_text: Current text to search + + Returns: + Position of tool call start or None if not found + """ + search_start = 0 + while True: + pos = current_text.find(self.tool_call_start_token, search_start) + if pos == -1: + return None + + think_regions = [(m.start(), m.end()) for m in re.finditer( + r"(.*?)", current_text, flags=re.DOTALL)] + in_think = any(pos >= t_start and pos < t_end + for t_start, t_end in think_regions) + + if not in_think: + return pos + + search_start = pos + 1 + + def _extract_content_before_tools(self, current_text: str, delta_text: str, + tool_start: int) -> Optional[str]: + """ + Extract content that appears before tool calls. + + Args: + current_text: Current text + delta_text: Delta text + tool_start: Start position of tools + + Returns: + Content before tools or None + """ + if tool_start > 0: + delta_start_pos = len(current_text) - len(delta_text) + if delta_start_pos < tool_start: + content_part = delta_text + if delta_start_pos + len(delta_text) > tool_start: + content_part = delta_text[:tool_start - delta_start_pos] + return content_part if content_part else None + return None + + def _extract_tool_content(self, current_text: str, tool_start: int) -> str: + """ + Extract tool content from current text starting at tool_start. 
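The helper above that locates the first tool-call start token outside any reasoning block is what keeps tool calls emitted inside the model's chain of thought from being surfaced. A standalone sketch, assuming "<think>...</think>" reasoning tags and a "<tool_calls>" start token (both are model-specific and only illustrative here):

import re
from typing import Optional

def find_tool_start_outside_thinking(
        text: str,
        start_token: str = "<tool_calls>",
        think_pattern: str = r"<think>(.*?)</think>") -> Optional[int]:
    # Positions of the start token are skipped when they fall inside a
    # completed thinking region; the first position outside is returned.
    think_regions = [(m.start(), m.end())
                     for m in re.finditer(think_pattern, text, flags=re.DOTALL)]
    search_start = 0
    while True:
        pos = text.find(start_token, search_start)
        if pos == -1:
            return None
        if not any(t_start <= pos < t_end for t_start, t_end in think_regions):
            return pos
        search_start = pos + 1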
+ + Args: + current_text: Current text + tool_start: Start position of tool calls + + Returns: + Extracted tool content + """ + tool_content_start = tool_start + len(self.tool_call_start_token) + tool_content = current_text[tool_content_start:] + + end_pos = tool_content.find(self.tool_call_end_token) + if end_pos != -1: + tool_content = tool_content[:end_pos] + + return tool_content From 609b533cb6f25f599fda94598bba446396498632 Mon Sep 17 00:00:00 2001 From: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com> Date: Thu, 7 Aug 2025 11:31:03 +0800 Subject: [PATCH 045/932] [Bugfix] Add proper comparison for package versions (#22314) Signed-off-by: Syed Muhammad Bin Asif --- benchmarks/kernels/benchmark_bitblas.py | 4 +++- docs/design/arch_overview.md | 3 ++- vllm/attention/ops/triton_decode_attention.py | 4 +++- vllm/model_executor/layers/quantization/bitblas.py | 4 +++- vllm/model_executor/layers/quantization/bitsandbytes.py | 7 +++++-- vllm/model_executor/layers/quantization/deepspeedfp.py | 3 ++- vllm/model_executor/layers/quantization/gptq_bitblas.py | 4 +++- vllm/model_executor/layers/quantization/ipex_quant.py | 7 +++++-- .../layers/quantization/kernels/mixed_precision/bitblas.py | 4 +++- .../layers/quantization/utils/bitblas_utils.py | 4 +++- .../model_executor/layers/quantization/utils/w8a8_utils.py | 5 +++-- vllm/model_executor/model_loader/bitsandbytes_loader.py | 4 +++- vllm/v1/sample/ops/topk_topp_sampler.py | 3 ++- 13 files changed, 40 insertions(+), 16 deletions(-) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 97ee060341..66b44c27d6 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -3,6 +3,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from packaging import version + from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( MINIMUM_BITBLAS_VERSION, ) @@ -10,7 +12,7 @@ from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}" diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md index 334df5dc9b..6b70867760 100644 --- a/docs/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -200,7 +200,8 @@ vision-language model. lora_config = vllm_config.lora_config super().__init__(config, cache_config, quant_config, lora_config, prefix) - if __version__ >= "0.6.4": + from packaging import version + if version.parse(__version__) >= version.parse("0.6.4"): MyModel = MyNewModel else: MyModel = MyOldModel diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index c27b377aeb..f82ce5b4d4 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -31,6 +31,8 @@ It supports page size >= 1. import logging +from packaging import version + from vllm.platforms import current_platform from vllm.triton_utils import tl, triton @@ -40,7 +42,7 @@ logger = logging.getLogger(__name__) # Only print the following warnings when triton version < 3.2.0. # The issue won't affect performance or accuracy. 
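The motivation for the version-check changes in this patch is that comparing version strings with '<' is lexicographic, which breaks as soon as a component reaches two digits; packaging.version.parse compares release segments numerically. A small illustration (MINIMUM_VERSION here is an example threshold, not tied to any particular package):

from packaging import version

# Lexicographic string comparison gives the wrong ordering once a version
# component has two digits:
assert "0.2.10" < "0.2.3"                                   # wrong ordering
assert version.parse("0.2.10") > version.parse("0.2.3")     # correct ordering

MINIMUM_VERSION = "0.46.1"

def require_min_version(installed: str,
                        minimum: str = MINIMUM_VERSION) -> None:
    # Mirror of the guard used throughout the patch: parse both sides
    # before comparing, and fail with an actionable message.
    if version.parse(installed) < version.parse(minimum):
        raise ImportError(f"installed version {installed} is too old; "
                          f"please install >= {minimum}")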
-if triton.__version__ < '3.2.0': +if version.parse(triton.__version__) < version.parse('3.2.0'): logger.warning( "The following error message 'operation scheduled before its operands' " "can be ignored.") diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index aa8eee88a9..39bd34d351 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -3,6 +3,7 @@ from typing import Any, Optional import torch +from packaging import version from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase @@ -45,7 +46,8 @@ class BitBLASConfig(QuantizationConfig): ) -> None: try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 5359189caa..0204ff4685 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Optional, Union import torch +from packaging import version from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) @@ -169,7 +170,8 @@ class BitsAndBytesLinearMethod(LinearMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: @@ -412,7 +414,8 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 8030be5259..2922aef329 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -6,6 +6,7 @@ from typing import Any, Optional import torch import torch.nn as nn import torch.nn.functional as F +from packaging import version from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods @@ -145,7 +146,7 @@ class DeepSpeedFPParameter(nn.Parameter): quant_config: DeepSpeedFPConfig): try: import deepspeed - if deepspeed.__version__ < "0.14.2": + if version.parse(deepspeed.__version__) < version.parse("0.14.2"): raise ImportError("deepspeed version is wrong. 
Please " "install deepspeed>=0.14.2.") from deepspeed.ops.fp_quantizer import FP_Quantize diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index caeb266d0b..d03074f861 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -3,6 +3,7 @@ from typing import Any, Optional import torch +from packaging import version from torch.nn.parameter import Parameter from vllm.logger import init_logger @@ -63,7 +64,8 @@ class GPTQBitBLASConfig(QuantizationConfig): try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 428e9b882b..9c458954f9 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -4,6 +4,7 @@ from typing import Any, Optional import torch +from packaging import version from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -135,7 +136,8 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod): try: import intel_extension_for_pytorch as ipex - if ipex.__version__ < MIN_IPEX_VERSION: + if version.parse( + ipex.__version__) < version.parse(MIN_IPEX_VERSION): raise ImportError( "intel_extension_for_pytorch version is " "wrong. Please install " @@ -199,7 +201,8 @@ class IPEXAWQLinearMethod(AWQLinearMethod): try: import intel_extension_for_pytorch as ipex - if ipex.__version__ < MIN_IPEX_VERSION: + if version.parse( + ipex.__version__) < version.parse(MIN_IPEX_VERSION): raise ImportError( "intel_extension_for_pytorch version is " "wrong. Please install " diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py index 649d07b4d0..0eca3b4c02 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py @@ -4,6 +4,7 @@ from typing import Optional import torch +from packaging import version from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -110,7 +111,8 @@ class BitBLASLinearKernel(MPLinearKernel): try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. 
Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py index 82ee3edfd5..4c2e548735 100644 --- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py +++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py @@ -3,6 +3,7 @@ from typing import Optional import torch +from packaging import version from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -75,7 +76,8 @@ def _check_bitblas_supported( # Finally, check if bitblas is installed try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse( + bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError("bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") except ImportError: diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 47bb457932..ddb5096890 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -4,6 +4,7 @@ from typing import Callable, Optional, Union import torch +from packaging import version from vllm import _custom_ops as ops from vllm import envs @@ -21,8 +22,8 @@ TORCH_DEVICE_IDENTITY = None # torch._scaled_mm rowwise feature. # The condition is determined once as the operations # are time consuming. -USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() - and torch.__version__[0:3] >= "2.7" +USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse( + torch.__version__) >= version.parse("2.7") and current_platform.has_device_capability(94)) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index f54dfab523..ea2fb2e3ac 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -12,6 +12,7 @@ from typing import Any, Callable, Optional import numpy as np import torch from huggingface_hub import HfApi +from packaging import version from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME @@ -193,7 +194,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 460e1c0b05..e0434c8f3d 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -5,6 +5,7 @@ from typing import Optional import torch import torch.nn as nn +from packaging import version from vllm import envs from vllm.logger import init_logger @@ -32,7 +33,7 @@ class TopKTopPSampler(nn.Module): if current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ - if flashinfer_version < "0.2.3": + if version.parse(flashinfer_version) < version.parse("0.2.3"): logger.warning_once( "FlashInfer version >= 0.2.3 required. 
" "Falling back to default sampling implementation.") From ecbea55ca254186ed6cbf62702242d73c177a75f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 04:31:41 +0100 Subject: [PATCH 046/932] Update `hf_xet` pin to resolve hangs (#22356) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/common.txt | 1 - requirements/nightly_torch_test.txt | 1 - requirements/test.in | 1 - requirements/test.txt | 3 +-- 4 files changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 5405df359a..5c422500e1 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -8,7 +8,6 @@ tqdm blake3 py-cpuinfo transformers >= 4.55.0 -huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7ae5e6f2f4..491fa06259 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -31,7 +31,6 @@ lm-eval[api]==0.4.8 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.52.4 tokenizers==0.21.1 -huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. # quantization bitsandbytes>=0.46.1 diff --git a/requirements/test.in b/requirements/test.in index 9c8c75dd6f..1e0cab80a2 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -37,7 +37,6 @@ lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test transformers==4.55.0 tokenizers==0.21.1 -huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. 
# quantization bitsandbytes==0.46.1 diff --git a/requirements/test.txt b/requirements/test.txt index 08ba964f22..324f8153b2 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -276,7 +276,7 @@ h5py==3.13.0 # via terratorch harfile==0.3.0 # via schemathesis -hf-xet==1.1.3 +hf-xet==1.1.7 # via huggingface-hub hiredis==3.0.0 # via tensorizer @@ -288,7 +288,6 @@ httpx==0.27.2 # schemathesis huggingface-hub==0.34.3 # via - # -r requirements/test.in # accelerate # datasets # evaluate From 14bcf93a6a59072fd5bc542d0ad73c54546cef5c Mon Sep 17 00:00:00 2001 From: "ZiTian.Zhao" Date: Thu, 7 Aug 2025 11:32:19 +0800 Subject: [PATCH 047/932] Optimize logger init performance by using module-level constants (#22373) Signed-off-by: zitian.zhao --- vllm/logger.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 69aaf4390a..8f06eb03c7 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -102,6 +102,14 @@ class _VllmLogger(Logger): _print_warning_once(self, msg, *args) +# Pre-defined methods mapping to avoid repeated dictionary creation +_METHODS_TO_PATCH = { + "debug_once": _print_debug_once, + "info_once": _print_info_once, + "warning_once": _print_warning_once, +} + + def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() @@ -144,13 +152,7 @@ def init_logger(name: str) -> _VllmLogger: logger = logging.getLogger(name) - methods_to_patch = { - "debug_once": _print_debug_once, - "info_once": _print_info_once, - "warning_once": _print_warning_once, - } - - for method_name, method in methods_to_patch.items(): + for method_name, method in _METHODS_TO_PATCH.items(): setattr(logger, method_name, MethodType(method, logger)) return cast(_VllmLogger, logger) From ad6c655dde487c256292ad85a538cdf5133ee28b Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Wed, 6 Aug 2025 23:33:24 -0400 Subject: [PATCH 048/932] preload heavy modules when mp method is forkserver (#22214) Signed-off-by: Lionel Villard --- vllm/benchmarks/latency.py | 4 +++- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index cebdf56c45..05378ec74d 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -13,7 +13,6 @@ import numpy as np from tqdm import tqdm import vllm.envs as envs -from vllm import LLM, SamplingParams from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import EngineArgs @@ -85,6 +84,9 @@ def main(args: argparse.Namespace): "Please set it to a valid path to use torch profiler.") engine_args = EngineArgs.from_cli_args(args) + # Lazy import to avoid importing LLM when the bench command is not selected. + from vllm import LLM, SamplingParams + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
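The logger change above moves the method-patching table to module scope so init_logger no longer rebuilds a dict on every call. A minimal sketch of the pattern with a single patched method; functools.cache stands in for vLLM's real deduplication helpers, which also handle lazy %-formatting and stack levels:

import logging
from functools import cache
from types import MethodType

@cache
def _log_info_once(logger: logging.Logger, msg: str) -> None:
    # cache() suppresses repeats of the same (logger, msg) pair; this is a
    # stand-in for the real *_once helpers, not their implementation.
    logger.info(msg)

# Built once at import time, instead of inside every init_logger() call.
_METHODS_TO_PATCH = {"info_once": _log_info_once}

def init_logger(name: str) -> logging.Logger:
    logger = logging.getLogger(name)
    for method_name, method in _METHODS_TO_PATCH.items():
        setattr(logger, method_name, MethodType(method, logger))
    return logger

log = init_logger(__name__)
log.info_once("engine initialized")   # logged
log.info_once("engine initialized")   # suppressed by the cache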
llm = LLM(**dataclasses.asdict(engine_args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 88ef16b87e..f6f83223a1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -8,6 +8,7 @@ import importlib import inspect import json import multiprocessing +import multiprocessing.forkserver as forkserver import os import signal import socket @@ -155,6 +156,15 @@ async def build_async_engine_client( client_config: Optional[dict[str, Any]] = None, ) -> AsyncIterator[EngineClient]: + if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver": + # The executor is expected to be mp. + # Pre-import heavy modules in the forkserver process + logger.debug("Setup forkserver with pre-imports") + multiprocessing.set_start_method('forkserver') + multiprocessing.set_forkserver_preload(["vllm.v1.engine.async_llm"]) + forkserver.ensure_running() + logger.debug("Forkserver setup complete!") + # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) From f6278b6243079784dc71e63244f6de38a47bf6c2 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 6 Aug 2025 20:56:02 -0700 Subject: [PATCH 049/932] [gpt-oss] Convert user input to harmony format (#22402) Signed-off-by: Chen Zhang Co-authored-by: Woosuk Kwon --- vllm/entrypoints/chat_utils.py | 4 +- vllm/entrypoints/harmony_utils.py | 60 ++++++- vllm/entrypoints/openai/protocol.py | 9 +- vllm/entrypoints/openai/serving_responses.py | 158 +++++++++++++++++-- 4 files changed, 216 insertions(+), 15 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a658d97cc8..74c8093f49 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -29,6 +29,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam, from openai.types.chat.chat_completion_content_part_input_audio_param import ( InputAudio) from openai.types.responses import ResponseInputImageParam +from openai_harmony import Message as OpenAIHarmonyMessage from PIL import Image from pydantic import BaseModel, ConfigDict, TypeAdapter # yapf: enable @@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, - CustomChatCompletionMessageParam] + CustomChatCompletionMessageParam, + OpenAIHarmonyMessage] # TODO: Make fields ReadOnly once mypy supports it diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index ecda35c980..ee08d62b57 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -2,14 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime from collections.abc import Iterable, Sequence -from typing import Literal, Optional +from typing import Literal, Optional, Union +from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem from openai.types.responses.tool import Tool -from openai_harmony import (Conversation, DeveloperContent, +from openai_harmony import (Author, Conversation, DeveloperContent, HarmonyEncodingName, Message, ReasoningEffort, Role, StreamableParser, SystemContent, TextContent, ToolDescription, load_harmony_encoding) +from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem, + ResponseReasoningItem) + REASONING_EFFORT = { "high": ReasoningEffort.HIGH, "medium": ReasoningEffort.MEDIUM, @@ -85,6 
+89,58 @@ def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) +def parse_response_input( + response_msg: ResponseInputOutputItem, + prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]] +) -> Message: + if not isinstance(response_msg, dict): + response_msg = response_msg.model_dump() + if "type" not in response_msg or response_msg["type"] == "message": + role = response_msg["role"] + content = response_msg["content"] + if role == "system": + # User is trying to set a system message. Change it to: + # <|start|>developer<|message|># Instructions + # {instructions}<|end|> + role = "developer" + text_prefix = "Instructions:\n" + else: + text_prefix = "" + if isinstance(content, str): + msg = Message.from_role_and_content(role, text_prefix + content) + else: + contents = [ + TextContent(text=text_prefix + c["text"]) for c in content + ] + msg = Message.from_role_and_contents(role, contents) + elif response_msg["type"] == "function_call_output": + call_id = response_msg["call_id"] + call_response: Optional[ResponseFunctionToolCall] = None + for prev_response in reversed(prev_responses): + if isinstance(prev_response, ResponseFunctionToolCall + ) and prev_response.call_id == call_id: + call_response = prev_response + break + if call_response is None: + raise ValueError(f"No call message found for {call_id}") + msg = Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{call_response.name}"), + response_msg["output"]) + elif response_msg["type"] == "reasoning": + content = response_msg["content"] + assert len(content) == 1 + msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"]) + elif response_msg["type"] == "function_call": + msg = Message.from_role_and_content(Role.ASSISTANT, + response_msg["arguments"]) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{response_msg['name']}") + msg = msg.with_content_type("json") + else: + raise ValueError(f"Unknown input type: {response_msg['type']}") + return msg + + def parse_chat_input(chat_msg) -> Message: role = chat_msg["role"] content = chat_msg["content"] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 57aa427207..421927d61b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -17,7 +17,8 @@ from openai.types.chat.chat_completion_audio import ( from openai.types.chat.chat_completion_message import ( Annotation as OpenAIAnnotation) # yapf: enable -from openai.types.responses import (ResponseInputParam, ResponseOutputItem, +from openai.types.responses import (ResponseFunctionToolCall, + ResponseInputItemParam, ResponseOutputItem, ResponseOutputMessage, ResponsePrompt, ResponseStatus, ResponseTextConfig) from openai.types.responses.response import ToolChoice @@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors], return None +ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, + ResponseFunctionToolCall] + + class ResponsesRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/responses/create @@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel): "reasoning.encrypted_content", ], ]] = None - input: Union[str, ResponseInputParam] + input: Union[str, list[ResponseInputOutputItem]] instructions: Optional[str] = None max_output_tokens: Optional[int] = None max_tool_calls: Optional[int] = None diff --git 
a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4ca863fd07..3c0b590b0c 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,12 +4,15 @@ import asyncio import time from collections.abc import AsyncGenerator, AsyncIterator +from copy import copy from http import HTTPStatus from typing import Callable, Final, Optional, Union import jinja2 from fastapi import Request -from openai.types.responses import ResponseOutputMessage, ResponseOutputText +from openai.types.responses import (ResponseFunctionToolCall, + ResponseOutputMessage, ResponseOutputText) +from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs from vllm.config import ModelConfig @@ -17,6 +20,10 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) from vllm.entrypoints.context import ConversationContext, SimpleContext +from vllm.entrypoints.harmony_utils import ( + get_developer_message, get_stop_tokens_for_assistant_actions, + get_system_message, get_user_message, parse_response_input, + render_for_completion) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -30,6 +37,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.tool_server import ToolServer +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams @@ -103,6 +111,29 @@ class OpenAIServingResponses(OpenAIServing): "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may " "cause a memory leak since we never remove responses from " "the store.") + + self.use_harmony = model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice " + "and always enable tool use.") + # OpenAI models have two EOS-like tokens: <|return|> and <|call|>. + # We need to add them to the stop token ids. + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + get_stop_tokens_for_assistant_actions()) + + # set up tool use + self.enable_auto_tools: bool = enable_auto_tools + if self.enable_auto_tools: + logger.info( + "\"auto\" tool choice has been enabled please note that while" + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored.") + if not self.use_harmony: + raise NotImplementedError("Auto tool choice is not supported " + "yet unless using Harmony") + # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove responses from the store. @@ -165,21 +196,20 @@ class OpenAIServingResponses(OpenAIServing): return self._make_not_found_error(prev_response_id) else: prev_response = None - # Construct the input messages. 
- messages = self._construct_input_messages(request, prev_response) try: lora_request = self._maybe_get_adapters(request) model_name = self._get_model_name(request.model, lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) - _, request_prompts, engine_prompts = await self._preprocess_chat( - request, - tokenizer, - messages, - chat_template=self.chat_template, - chat_template_content_format=self.chat_template_content_format, - ) + if self.use_harmony: + messages, request_prompts, engine_prompts = ( + self._make_request_with_harmony(request, prev_response)) + else: + messages, request_prompts, engine_prompts = ( + await self._make_request(request, prev_response, + tokenizer)) + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -275,6 +305,38 @@ class OpenAIServingResponses(OpenAIServing): except Exception as e: return self.create_error_response(str(e)) + async def _make_request( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + tokenizer: AnyTokenizer, + ): + # Construct the input messages. + messages = self._construct_input_messages(request, prev_response) + _, request_prompts, engine_prompts = await self._preprocess_chat( + request, + tokenizer, + messages, + chat_template=self.chat_template, + chat_template_content_format=self.chat_template_content_format, + ) + return messages, request_prompts, engine_prompts + + def _make_request_with_harmony( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + ): + if request.tool_choice != "auto": + raise NotImplementedError( + "Only 'auto' tool_choice is supported in " + "response API with Harmony") + messages = self._construct_input_messages_with_harmony( + request, prev_response) + prompt_token_ids = render_for_completion(messages) + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + return messages, [prompt_token_ids], [engine_prompt] + async def responses_full_generator( self, request: ResponsesRequest, @@ -411,6 +473,82 @@ class OpenAIServingResponses(OpenAIServing): messages.extend(request.input) # type: ignore return messages + def _construct_input_messages_with_harmony( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + ) -> list[OpenAIHarmonyMessage]: + messages: list[OpenAIHarmonyMessage] = [] + if prev_response is None: + # New conversation. + reasoning_effort = (request.reasoning.effort + if request.reasoning else None) + tool_types = [tool.type for tool in request.tools] + enable_browser = ("web_search_preview" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("browser")) + enable_code_interpreter = ("code_interpreter" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("python")) + sys_msg = get_system_message( + reasoning_effort=reasoning_effort, + browser_description=self.tool_server.get_tool_description( + "browser") + if enable_browser and self.tool_server is not None else None, + python_description=self.tool_server.get_tool_description( + "python") if enable_code_interpreter + and self.tool_server is not None else None, + ) + messages.append(sys_msg) + dev_msg = get_developer_message(request.instructions, + request.tools) + messages.append(dev_msg) + else: + # Continue the previous conversation. + # FIXME(woosuk): Currently, request params like reasoning and + # instructions are ignored. 
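For a new conversation, the harmony path above builds the prompt from a system message (reasoning effort plus optional tool descriptions), a developer message carrying the instructions and function tools, and the user input, then renders the whole conversation to token IDs. A sketch of that first-turn path, assuming the helper signatures exactly as they are used in this patch and no browser or code-interpreter tools:

from vllm.entrypoints.harmony_utils import (get_developer_message,
                                            get_system_message,
                                            get_user_message,
                                            render_for_completion)
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

def build_first_turn_prompt(instructions, tools, user_text):
    # First turn only: system + developer prefix, then the user message.
    messages = [
        get_system_message(reasoning_effort=None,
                           browser_description=None,
                           python_description=None),
        get_developer_message(instructions, tools),
        get_user_message(user_text),
    ]
    prompt_token_ids = render_for_completion(messages)
    return EngineTokensPrompt(prompt_token_ids=prompt_token_ids)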
+ prev_msgs = self.msg_store[prev_response.id] + # Remove the previous chain-of-thoughts if there is a new "final" + # message. Note that this also removes these messages from the + # msg_store. + if len(prev_msgs) > 0: + last_msg = prev_msgs[-1] + assert isinstance(last_msg, OpenAIHarmonyMessage) + if last_msg.channel == "final": + prev_final_msg_idx = -1 + for i in range(len(prev_msgs) - 2, -1, -1): + prev_msg_i = prev_msgs[i] + assert isinstance(prev_msg_i, OpenAIHarmonyMessage) + if prev_msg_i.channel == "final": + prev_final_msg_idx = i + break + recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:] + del prev_msgs[prev_final_msg_idx + 1:] + for msg in recent_turn_msgs: + assert isinstance(msg, OpenAIHarmonyMessage) + if msg.channel != "analysis": + prev_msgs.append(msg) + messages.extend(prev_msgs) + # Append the new input. + # Reponses API supports simple text inputs without chat format. + if isinstance(request.input, str): + messages.append(get_user_message(request.input)) + else: + if prev_response is not None: + prev_outputs = copy(prev_response.output) + else: + prev_outputs = [] + for response_msg in request.input: + messages.append( + parse_response_input(response_msg, prev_outputs)) + # User passes in a a tool call request and its output. We need + # to add the tool call request to prev_outputs so that the + # parse_response_input can find the tool call request when + # parsing the tool call output. + if isinstance(response_msg, ResponseFunctionToolCall): + prev_outputs.append(response_msg) + return messages + async def _run_background_request( self, request: ResponsesRequest, From 4be02a37767f05a3fd27d66435d5cebea7a9bfe8 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 7 Aug 2025 12:07:54 +0800 Subject: [PATCH 050/932] [Bugfix] EPLB load statistics problem (#22167) Signed-off-by: ycyaw66 <497410282@qq.com> Signed-off-by: David Chen <530634352@qq.com> Co-authored-by: ycyaw66 <497410282@qq.com> --- vllm/distributed/eplb/eplb_state.py | 50 +++++++++---------- vllm/model_executor/layers/fused_moe/layer.py | 17 +------ 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index f64b516b0d..c415d409f7 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -32,7 +32,7 @@ from dataclasses import dataclass from typing import Optional, Union import torch -from torch.distributed import ProcessGroup, all_gather, all_reduce +from torch.distributed import ProcessGroup, all_reduce from vllm.config import ParallelConfig from vllm.distributed.parallel_state import (get_ep_group, get_node_count, @@ -112,13 +112,21 @@ class EplbState: Expert load during this forward pass. We use the token count each expert processes as the load. - Shape: (num_moe_layers, num_local_physical_experts) + Shape: (num_moe_layers, num_physical_experts) """ expert_load_window: torch.Tensor """ A sliding window of expert load. - Shape: (window_size, num_moe_layers, num_local_physical_experts) + Shape: (window_size, num_moe_layers, num_physical_experts) + + NOTE: The expert_load_view now records load for all physical experts + rather than just local experts. This ensures consistent load statistics + across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels). + The recorded load will be multiplied by dp_size when using naive all-to-all + due to each DP rank contributing the same token set to the calculation. 
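With the load recorded per physical expert as described above, per-rank statistics for logging are recovered by folding the expert dimension back into (num_ranks, experts_per_rank). A small sketch of that reduction, assuming physical experts are laid out contiguously by EP rank:

import torch

def per_rank_token_counts(expert_load_pass: torch.Tensor,
                          num_ranks: int) -> torch.Tensor:
    # expert_load_pass: (num_moe_layers, num_physical_experts), already
    # summed across ranks (e.g. via all_reduce). Returns a float tensor of
    # shape (num_moe_layers, num_ranks) with the token count per EP rank.
    num_layers, num_physical_experts = expert_load_pass.shape
    assert num_physical_experts % num_ranks == 0
    return (expert_load_pass
            .reshape(num_layers, num_ranks, -1)
            .sum(dim=-1)
            .float())

load = torch.tensor([[3, 1, 0, 4], [2, 2, 2, 2]])   # 2 layers, 4 experts
print(per_rank_token_counts(load, num_ranks=2))     # [[4., 4.], [4., 4.]]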
+ See: + https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856 """ expert_load_window_step: int = 0 """ @@ -232,14 +240,14 @@ class EplbState: ).contiguous() expert_load_pass = torch.zeros( - (model.num_moe_layers, model.num_local_physical_experts), + (model.num_moe_layers, model.num_physical_experts), dtype=torch.int32, device=device, ) expert_load_window_size = parallel_config.eplb_window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, - model.num_local_physical_experts), + model.num_physical_experts), dtype=torch.int32, device=device, ) @@ -353,18 +361,18 @@ class EplbState: self.expert_load_pass.zero_() if log_stats: - # `num_tokens`: (num_moe_layers,) - num_tokens = self.expert_load_pass.sum(dim=-1) + # total_expert_load_pass: (num_moe_layers, num_physical_experts) + total_expert_load_pass = self.expert_load_pass.clone() # Collect load metrics from all ranks ep_group = get_ep_group().device_group assert ep_group is not None - num_tokens_list = [ - torch.empty_like(num_tokens) for _ in range(ep_group.size()) - ] - all_gather(num_tokens_list, num_tokens, group=ep_group) - # Stack to get (num_ranks, num_moe_layers) - num_tokens_per_rank = torch.stack(num_tokens_list).float() + all_reduce(total_expert_load_pass, group=ep_group) + + # num_tokens_per_rank: (num_moe_layers, num_ranks) + num_tokens_per_rank = total_expert_load_pass.reshape( + total_expert_load_pass.shape[0], ep_group.size(), + -1).sum(dim=-1).float() # Compute balancedness ratio: # for each layer: @@ -426,17 +434,7 @@ class EplbState: "(profile)" if is_profile else "") if global_expert_load is None: - # This mapping is only used here, so we do not store it in the state - physical_expert_start = ep_rank * model.num_local_physical_experts - physical_expert_end = (physical_expert_start + - model.num_local_physical_experts) - # (num_moe_layers, num_local_physical_experts) - local_physical_to_logical_map = self.physical_to_logical_map[ - :, - physical_expert_start:physical_expert_end, - ] - - # Map the local physical expert load to global logical experts + # Map the physical expert load to global logical experts logical_expert_load_window = torch.zeros( self.expert_load_window_size, model.num_moe_layers, @@ -446,7 +444,7 @@ class EplbState: ) logical_expert_load_window.scatter_add_( dim=-1, - index=local_physical_to_logical_map.unsqueeze(0).expand_as( + index=self.physical_to_logical_map.unsqueeze(0).expand_as( self.expert_load_window).long(), src=self.expert_load_window, ) @@ -618,4 +616,4 @@ def _node_count_with_rank_mapping( if is_same_node and node_assignment[other_rank] == 0: node_assignment[other_rank] = next_node_id - return next_node_id + return next_node_id \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a4a6157fa4..72c2bc9a3d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1430,22 +1430,9 @@ class FusedMoE(torch.nn.Module): # to the modular kernel, we can move this logic there # to achieve better efficiency. 
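Rearrangement operates on logical experts, so the recorded per-physical-expert window has to be folded through the replica mapping first; the scatter_add_ above does exactly that. A standalone sketch of the mapping step for a single snapshot (tensor names are illustrative):

import torch

def physical_to_logical_load(load: torch.Tensor,
                             physical_to_logical_map: torch.Tensor,
                             num_logical_experts: int) -> torch.Tensor:
    # load:                    (num_moe_layers, num_physical_experts)
    # physical_to_logical_map: (num_moe_layers, num_physical_experts),
    #                          values in [0, num_logical_experts)
    logical = torch.zeros(load.shape[0], num_logical_experts,
                          dtype=load.dtype, device=load.device)
    # Replicas of the same logical expert accumulate into one slot.
    logical.scatter_add_(dim=-1,
                         index=physical_to_logical_map.long(),
                         src=load)
    return logical

load = torch.tensor([[5, 3, 2]])      # one layer, three physical experts
mapping = torch.tensor([[0, 0, 1]])   # experts 0 and 1 replicate logical expert 0
print(physical_to_logical_load(load, mapping, num_logical_experts=2))  # [[8, 2]]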
- # `expert_load_view`: (num_logical_experts,) + # `expert_load_view`: (num_physical_experts,) - # Mask out non-local experts - if expert_map is not None: - topk_ids_local = expert_map[topk_ids] - topk_ids_flatten = topk_ids_local.flatten() - else: - topk_ids_flatten = topk_ids.flatten() - - # Should be equivalent to: - # ``` - # topk_ids_masked = topk_ids_local[topk_ids_local >= 0] - # expert_load_view += topk_ids_masked.bincount( - # minlength=expert_load_view.shape[0]) - # ``` - # We use `scatter_add_` since `bincount` cannot be compiled + topk_ids_flatten = topk_ids.flatten() # Performance optimization: # `masked_fill` is significantly faster than `masked_select` From 2a4c825523d5715068bf3ec373f662e113c66f45 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:05:03 +0800 Subject: [PATCH 051/932] [CI] Skip the pooling models that do not support transformers v4.55 (#22411) Signed-off-by: wang.yuqi --- tests/models/language/pooling/test_embedding.py | 5 ++++- tests/models/language/pooling/test_gte.py | 9 +++++++++ tests/models/language/pooling/test_reward.py | 4 ++++ tests/models/utils.py | 11 +++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 51283dc630..2dd35c4151 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -7,7 +7,7 @@ import pytest from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ...utils import check_embeddings_close +from ...utils import check_embeddings_close, check_transformers_version @pytest.fixture(autouse=True) @@ -56,6 +56,9 @@ def test_models( model, monkeypatch, ) -> None: + if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model, max_transformers_version="4.53.2") + if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 6d2eff7099..48a0cd64fe 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,6 +4,7 @@ from typing import Any import pytest +from ...utils import check_transformers_version from .embed_utils import EmbedModelInfo, correctness_test_embed_models from .mteb_utils import mteb_test_embed_models @@ -60,6 +61,10 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: + if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model_info.name, + max_transformers_version="4.53.2") + vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} @@ -72,6 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner, def test_embed_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts) -> None: + if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model_info.name, + max_transformers_version="4.53.2") + vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} diff --git a/tests/models/language/pooling/test_reward.py 
b/tests/models/language/pooling/test_reward.py index 7add1d975c..beafa0aed9 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -10,6 +10,7 @@ from transformers import AutoModel from vllm.platforms import current_platform from ....conftest import HfRunner +from ...utils import check_transformers_version @pytest.fixture(autouse=True) @@ -86,6 +87,9 @@ def test_prm_models( dtype: str, monkeypatch, ) -> None: + check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B", + max_transformers_version="4.53.2") + if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": pytest.skip("CPU only supports V1") diff --git a/tests/models/utils.py b/tests/models/utils.py index 1513db5220..4657df60b1 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -412,3 +412,14 @@ def dummy_hf_overrides( }) return hf_config + + +def check_transformers_version(model: str, + min_transformers_version: Optional[str] = None, + max_transformers_version: Optional[str] = None): + from .registry import _HfExamplesInfo + + return _HfExamplesInfo(model, + min_transformers_version=min_transformers_version, + max_transformers_version=max_transformers_version + ).check_transformers_version(on_fail="skip") From 4d4297e8fe96d64be0a114636512fbbe1e5ee0d6 Mon Sep 17 00:00:00 2001 From: lkchen Date: Wed, 6 Aug 2025 23:05:07 -0700 Subject: [PATCH 052/932] [Bench] Split serve.py:main into async/async versions (#22405) Signed-off-by: Linkun --- vllm/benchmarks/serve.py | 102 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 49 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index ca8d218581..6d52b51a9f 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -948,7 +948,10 @@ def add_cli_args(parser: argparse.ArgumentParser): ) -def main(args: argparse.Namespace): +def main(args: argparse.Namespace) -> dict[str, Any]: + return asyncio.run(main_async(args)) + +async def main_async(args: argparse.Namespace) -> dict[str, Any]: print(args) random.seed(args.seed) np.random.seed(args.seed) @@ -1025,8 +1028,7 @@ def main(args: argparse.Namespace): gc.collect() gc.freeze() - benchmark_result = asyncio.run( - benchmark( + benchmark_result = await benchmark( endpoint_type=args.endpoint_type, api_url=api_url, base_url=base_url, @@ -1052,62 +1054,62 @@ def main(args: argparse.Namespace): ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, ready_check_timeout_sec=args.ready_check_timeout_sec, - )) + ) # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} + result_json: dict[str, Any] = {} - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["endpoint_type"] = args.endpoint_type - result_json["label"] = label - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["endpoint_type"] = args.endpoint_type + result_json["label"] = label + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata 
format. Please use KEY=VALUE format." - ) + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." + ) - # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency + # Traffic + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency - if args.ramp_up_strategy is not None: - result_json["ramp_up_strategy"] = args.ramp_up_strategy - result_json["ramp_up_start_rps"] = args.ramp_up_start_rps - result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file + if args.save_result or args.append_result: base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") @@ -1129,3 +1131,5 @@ def main(args: argparse.Namespace): outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) + + return result_json \ No newline at end of file From cbc8457b2663e66beb2dedb20f3f0728b82ae603 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 7 Aug 2025 14:05:24 +0800 Subject: [PATCH 053/932] [Model] Switch to Fused RMS norm in Qwen2.5_VL model. 
(#22184) Signed-off-by: kf Signed-off-by: tjtanaavllm Signed-off-by: vllmellm Co-authored-by: kf --- vllm/model_executor/models/qwen2_5_vl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 45fb7f9580..79c5c77f6d 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -396,13 +396,13 @@ class Qwen2_5_VisionBlock(nn.Module): max_seqlen: Optional[int] = None, # Only used for Flash Attention seqlens: Optional[list[int]] = None, # Only used for xFormers ) -> torch.Tensor: - x = x + self.attn(self.norm1(x), - cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens) - - x = x + self.mlp(self.norm2(x)) + x_attn = self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + x_fused_norm, residual = self.norm2(x, residual=x_attn) + x = residual + self.mlp(x_fused_norm) return x From 370661856bcfc4cdc9a88580cb70d66b7ac9fc7c Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:06:00 +0200 Subject: [PATCH 054/932] [Frontend] Update OpenAI error response to upstream format (#22099) Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> --- .../entrypoints/openai/test_classification.py | 5 +- .../entrypoints/openai/test_lora_resolvers.py | 8 +-- .../entrypoints/openai/test_serving_models.py | 16 +++--- .../openai/test_transcription_validation.py | 17 ++++--- .../openai/test_translation_validation.py | 5 +- vllm/entrypoints/openai/api_server.py | 50 ++++++++++--------- vllm/entrypoints/openai/protocol.py | 7 ++- vllm/entrypoints/openai/run_batch.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 21 ++++---- vllm/entrypoints/openai/serving_models.py | 9 ++-- 10 files changed, 73 insertions(+), 67 deletions(-) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index bcf127307f..886267c211 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -121,8 +121,7 @@ def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer, error = classification_response.json() assert classification_response.status_code == 400 - assert error["object"] == "error" - assert "truncate_prompt_tokens" in error["message"] + assert "truncate_prompt_tokens" in error["error"]["message"] @pytest.mark.parametrize("model_name", [MODEL_NAME]) @@ -137,7 +136,7 @@ def test_empty_input_error(server: RemoteOpenAIServer, model_name: str): error = classification_response.json() assert classification_response.status_code == 400 - assert error["object"] == "error" + assert "error" in error @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index d4afdf7751..f480117258 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -160,8 +160,8 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup, mock_engine.generate.assert_not_called() assert isinstance(response, ErrorResponse) - assert response.code == HTTPStatus.NOT_FOUND.value - assert non_existent_model in response.message + assert response.error.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in 
response.error.message @pytest.mark.asyncio @@ -190,8 +190,8 @@ async def test_serving_completion_resolver_add_lora_fails( # Assert the correct error response assert isinstance(response, ErrorResponse) - assert response.code == HTTPStatus.BAD_REQUEST.value - assert invalid_model in response.message + assert response.error.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.error.message @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index c3b458d717..bc6a0341f5 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -66,8 +66,8 @@ async def test_load_lora_adapter_missing_fields(): request = LoadLoRAAdapterRequest(lora_name="", lora_path="") response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST @pytest.mark.asyncio @@ -84,8 +84,8 @@ async def test_load_lora_adapter_duplicate(): lora_path="/path/to/adapter1") response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST assert len(serving_models.lora_requests) == 1 @@ -110,8 +110,8 @@ async def test_unload_lora_adapter_missing_fields(): request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None) response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST @pytest.mark.asyncio @@ -120,5 +120,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "NotFoundError" - assert response.code == HTTPStatus.NOT_FOUND + assert response.error.type == "NotFoundError" + assert response.error.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index a8e2eb40b1..28fd02171b 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -116,8 +116,10 @@ async def test_non_asr_model(winning_call): file=winning_call, language="en", temperature=0.0) - assert res.code == 400 and not res.text - assert res.message == "The model does not support Transcriptions API" + err = res.error + assert err["code"] == 400 and not res.text + assert err[ + "message"] == "The model does not support Transcriptions API" @pytest.mark.asyncio @@ -133,12 +135,15 @@ async def test_completion_endpoints(): "role": "system", "content": "You are a helpful assistant." 
}]) - assert res.code == 400 - assert res.message == "The model does not support Chat Completions API" + err = res.error + assert err["code"] == 400 + assert err[ + "message"] == "The model does not support Chat Completions API" res = await client.completions.create(model=model_name, prompt="Hello") - assert res.code == 400 - assert res.message == "The model does not support Completions API" + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 79e769e3a1..bfa9bdef1c 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -73,8 +73,9 @@ async def test_non_asr_model(foscolo): res = await client.audio.translations.create(model=model_name, file=foscolo, temperature=0.0) - assert res.code == 400 and not res.text - assert res.message == "The model does not support Translations API" + err = res.error + assert err["code"] == 400 and not res.text + assert err["message"] == "The model does not support Translations API" @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f6f83223a1..c695ea8b5a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -62,7 +62,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, - EmbeddingResponse, ErrorResponse, + EmbeddingResponse, ErrorInfo, + ErrorResponse, LoadLoRAAdapterRequest, PoolingRequest, PoolingResponse, RerankRequest, RerankResponse, @@ -506,7 +507,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TokenizeResponse): return JSONResponse(content=generator.model_dump()) @@ -540,7 +541,7 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, DetokenizeResponse): return JSONResponse(content=generator.model_dump()) @@ -556,7 +557,7 @@ def maybe_register_tokenizer_info_endpoint(args): """Get comprehensive tokenizer information.""" result = await tokenization(raw_request).get_tokenizer_info() return JSONResponse(content=result.model_dump(), - status_code=result.code if isinstance( + status_code=result.error.code if isinstance( result, ErrorResponse) else 200) @@ -603,7 +604,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ResponsesResponse): return JSONResponse(content=generator.model_dump()) return StreamingResponse(content=generator, media_type="text/event-stream") @@ -620,7 +621,7 @@ async def retrieve_responses(response_id: str, raw_request: Request): if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return JSONResponse(content=response.model_dump()) @@ -635,7 
+636,7 @@ async def cancel_responses(response_id: str, raw_request: Request): if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return JSONResponse(content=response.model_dump()) @@ -670,7 +671,7 @@ async def create_chat_completion(request: ChatCompletionRequest, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ChatCompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -715,7 +716,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, CompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -744,7 +745,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, EmbeddingResponse): return JSONResponse(content=generator.model_dump()) @@ -772,7 +773,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): generator = await handler.create_pooling(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, PoolingResponse): return JSONResponse(content=generator.model_dump()) @@ -792,7 +793,7 @@ async def create_classify(request: ClassificationRequest, generator = await handler.create_classify(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ClassificationResponse): return JSONResponse(content=generator.model_dump()) @@ -821,7 +822,7 @@ async def create_score(request: ScoreRequest, raw_request: Request): generator = await handler.create_score(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ScoreResponse): return JSONResponse(content=generator.model_dump()) @@ -881,7 +882,7 @@ async def create_transcriptions(raw_request: Request, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TranscriptionResponse): return JSONResponse(content=generator.model_dump()) @@ -922,7 +923,7 @@ async def create_translations(request: Annotated[TranslationRequest, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TranslationResponse): return JSONResponse(content=generator.model_dump()) @@ -950,7 +951,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): generator = await handler.do_rerank(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) 
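For reference, a minimal sketch of the payload shape this change produces on the wire; the two models below merely mirror the ErrorInfo/ErrorResponse definitions added to protocol.py later in this patch, and the message text is one of the strings exercised by the tests above.

from typing import Optional
from pydantic import BaseModel

class ErrorInfo(BaseModel):
    message: str
    type: str
    param: Optional[str] = None
    code: int

class ErrorResponse(BaseModel):
    error: ErrorInfo

resp = ErrorResponse(error=ErrorInfo(
    message="The model does not support Completions API",
    type="BadRequestError",
    code=400,
))
print(resp.model_dump())
# {'error': {'message': '...', 'type': 'BadRequestError', 'param': None, 'code': 400}}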
elif isinstance(generator, RerankResponse): return JSONResponse(content=generator.model_dump()) @@ -1175,7 +1176,7 @@ async def invocations(raw_request: Request): msg = ("Cannot find suitable handler for request. " f"Expected one of: {type_names}") res = base(raw_request).create_error_response(message=msg) - return JSONResponse(content=res.model_dump(), status_code=res.code) + return JSONResponse(content=res.model_dump(), status_code=res.error.code) if envs.VLLM_TORCH_PROFILER_DIR: @@ -1211,7 +1212,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: response = await handler.load_lora_adapter(request) if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return Response(status_code=200, content=response) @@ -1223,7 +1224,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: response = await handler.unload_lora_adapter(request) if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return Response(status_code=200, content=response) @@ -1502,9 +1503,10 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(HTTPException) async def http_exception_handler(_: Request, exc: HTTPException): - err = ErrorResponse(message=exc.detail, + err = ErrorResponse( + error=ErrorInfo(message=exc.detail, type=HTTPStatus(exc.status_code).phrase, - code=exc.status_code) + code=exc.status_code)) return JSONResponse(err.model_dump(), status_code=exc.status_code) @app.exception_handler(RequestValidationError) @@ -1518,9 +1520,9 @@ def build_app(args: Namespace) -> FastAPI: else: message = exc_str - err = ErrorResponse(message=message, - type=HTTPStatus.BAD_REQUEST.phrase, - code=HTTPStatus.BAD_REQUEST) + err = ErrorResponse(error=ErrorInfo(message=message, + type=HTTPStatus.BAD_REQUEST.phrase, + code=HTTPStatus.BAD_REQUEST)) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 421927d61b..ea2cf57563 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -78,14 +78,17 @@ class OpenAIBaseModel(BaseModel): return result -class ErrorResponse(OpenAIBaseModel): - object: str = "error" +class ErrorInfo(OpenAIBaseModel): message: str type: str param: Optional[str] = None code: int +class ErrorResponse(OpenAIBaseModel): + error: ErrorInfo + + class ModelPermission(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") object: str = "model_permission" diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index d146ad485d..a10d57456b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -302,7 +302,7 @@ async def run_request(serving_engine_func: Callable, id=f"vllm-{random_uuid()}", custom_id=request.custom_id, response=BatchResponseData( - status_code=response.code, + status_code=response.error.code, request_id=f"vllm-batch-{random_uuid()}"), error=response, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 822f186840..efd2f20299 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -47,10 +47,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, - EmbeddingResponse, 
ErrorResponse, - PoolingResponse, RerankRequest, - ResponsesRequest, ScoreRequest, - ScoreResponse, + EmbeddingResponse, ErrorInfo, + ErrorResponse, PoolingResponse, + RerankRequest, ResponsesRequest, + ScoreRequest, ScoreResponse, TokenizeChatRequest, TokenizeCompletionRequest, TokenizeResponse, @@ -412,21 +412,18 @@ class OpenAIServing: message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: - return ErrorResponse(message=message, - type=err_type, - code=status_code.value) + return ErrorResponse(error=ErrorInfo( + message=message, type=err_type, code=status_code.value)) def create_streaming_error_response( self, message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: - json_str = json.dumps({ - "error": + json_str = json.dumps( self.create_error_response(message=message, err_type=err_type, - status_code=status_code).model_dump() - }) + status_code=status_code).model_dump()) return json_str async def _check_model( @@ -445,7 +442,7 @@ class OpenAIServing: if isinstance(load_result, LoRARequest): return None if isinstance(load_result, ErrorResponse) and \ - load_result.code == HTTPStatus.BAD_REQUEST.value: + load_result.error.code == HTTPStatus.BAD_REQUEST.value: error_response = load_result return error_response or self.create_error_response( diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 27614fcb41..a4efa0815b 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -9,7 +9,7 @@ from typing import Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.openai.protocol import (ErrorResponse, +from vllm.entrypoints.openai.protocol import (ErrorInfo, ErrorResponse, LoadLoRAAdapterRequest, ModelCard, ModelList, ModelPermission, @@ -82,7 +82,7 @@ class OpenAIServingModels: load_result = await self.load_lora_adapter( request=load_request, base_model_name=lora.base_model_name) if isinstance(load_result, ErrorResponse): - raise ValueError(load_result.message) + raise ValueError(load_result.error.message) def is_base_model(self, model_name) -> bool: return any(model.name == model_name for model in self.base_model_paths) @@ -284,6 +284,5 @@ def create_error_response( message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: - return ErrorResponse(message=message, - type=err_type, - code=status_code.value) + return ErrorResponse(error=ErrorInfo( + message=message, type=err_type, code=status_code.value)) From 82216dc21f777584bcf53ab1fe4936390c1737bf Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Wed, 6 Aug 2025 23:06:20 -0700 Subject: [PATCH 055/932] [Misc] Support routing logic simulation (#21990) Signed-off-by: Ming Yang Co-authored-by: Tyler Michael Smith Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/test_routing_simulator.py | 171 +++++++++++ vllm/envs.py | 9 + vllm/model_executor/layers/fused_moe/layer.py | 12 + .../layers/fused_moe/routing_simulator.py | 289 ++++++++++++++++++ 4 files changed, 481 insertions(+) create mode 100644 tests/test_routing_simulator.py create mode 100644 vllm/model_executor/layers/fused_moe/routing_simulator.py diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py new file mode 100644 index 0000000000..8324b225a8 --- /dev/null +++ 
b/tests/test_routing_simulator.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test script for the token-to-expert routing simulator. + +This script demonstrates how to use the routing simulator to test +different routing strategies and analyze their performance, including +integration tests with FusedMoE layer. +""" + +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.routing_simulator import ( + DistributionBasedRouting, RoutingSimulator) + + +@pytest.fixture +def device(): + """Fixture to provide the appropriate device for testing.""" + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@pytest.mark.parametrize("num_tokens", [1, 16, 256]) +@pytest.mark.parametrize("hidden_size", [64, 1024]) +@pytest.mark.parametrize("num_experts", [16, 128]) +@pytest.mark.parametrize("top_k", [1, 4]) +def test_basic_functionality( + num_tokens: int, + hidden_size: int, + num_experts: int, + top_k: int, + device, +): + """Test basic functionality of the routing simulator.""" + # Test each routing strategy + strategies = RoutingSimulator.get_available_strategies() + + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + for strategy in strategies: + # Simulate routing + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name=strategy, + top_k=top_k, + ) + + # Check output shapes + assert topk_weights.shape == ( + num_tokens, + top_k, + ), f"Wrong weights shape for {strategy}" + assert topk_ids.shape == ( + num_tokens, + top_k, + ), f"Wrong ids shape for {strategy}" + + # Check that expert IDs are valid + assert (topk_ids.min() + >= 0), f"Invalid expert ID (negative) for {strategy}" + assert (topk_ids.max() + < num_experts), f"Invalid expert ID (too large) for {strategy}" + + +def test_routing_strategy_integration(monkeypatch, device): + """Test that the routing strategy environment variable works with + FusedMoE.""" + pytest.importorskip("vllm.model_executor.layers.fused_moe.layer") + + import vllm.envs as envs + from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + # Test parameters + num_tokens = 32 + hidden_size = 16 + num_experts = 4 + top_k = 2 + + # Create test data + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + # Test different routing strategies + strategies = RoutingSimulator.get_available_strategies() + + for strategy in strategies: + # Set environment variable + env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY" + monkeypatch.setenv(env_name, strategy) + + # Force reload of environment variable + envs.environment_variables[env_name] = lambda s=strategy: s + + # Test the select_experts method + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=False, + renormalize=True, + indices_type=torch.long) + + # Verify output shapes + assert topk_weights.shape == ( + num_tokens, top_k), f"Wrong weights shape for {strategy}" + assert topk_ids.shape == (num_tokens, + top_k), f"Wrong ids shape for {strategy}" + + # Verify expert IDs are valid + assert topk_ids.min( + ) >= 0, f"Invalid expert ID (negative) for {strategy}" + assert topk_ids.max( + ) < num_experts, f"Invalid expert ID 
(too large) for {strategy}" + + +def test_distribution_based_routing_with_custom_strategy(): + """Test registering and using DistributionBasedRouting with custom + parameters.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Register custom distribution-based strategy + custom_strategy = DistributionBasedRouting(distribution="normal", + mean=2.0, + std=0.5) + RoutingSimulator.register_strategy("custom_normal", custom_strategy) + + # Test data + num_tokens = 60 + hidden_size = 48 + num_experts = 6 + top_k = 3 + + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + # Use the custom strategy + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name="custom_normal", + top_k=top_k) + + # Check output shapes + assert topk_weights.shape == (num_tokens, top_k) + assert topk_ids.shape == (num_tokens, top_k) + + # Check that expert IDs are valid + assert topk_ids.min() >= 0 + assert topk_ids.max() < num_experts + + +def test_instance_compatibility(): + """Test that static methods work correctly.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Test static method directly + hidden_states = torch.randn(10, 8, device=device) + router_logits = torch.randn(10, 4, device=device) + + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name="uniform_random", + top_k=2) + + assert topk_weights.shape == (10, 2) + assert topk_ids.shape == (10, 2) diff --git a/vllm/envs.py b/vllm/envs.py index d9ebf59c1a..f6c6d7e7ed 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -989,6 +989,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + # MoE routing strategy selector. + # See `RoutingSimulator.get_available_strategies()` # for available + # strategies. + # Cutstom routing strategies can be registered by + # RoutingSimulator.register_strategy() + # Note: custom strategies may not produce correct model outputs + "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": + lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(), + # Regex timeout for use by the vLLM tool parsing plugins. 
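A usage sketch for the routing-simulation switch added above; the strategy names come from the registry in routing_simulator.py below, and the served model name is only a placeholder.

# Select a simulated routing strategy for a running server, e.g.
#   VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random vllm serve <model>
# or drive the simulator directly, as the new tests do:
import torch

from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator

hidden_states = torch.randn(8, 16)   # (num_tokens, hidden_size)
router_logits = torch.randn(8, 4)    # (num_tokens, num_experts)
topk_weights, topk_ids = RoutingSimulator.simulate_routing(
    hidden_states=hidden_states,
    router_logits=router_logits,
    strategy_name="uniform_random",  # or "normal_routing"
    top_k=2,
)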
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 72c2bc9a3d..76cedb3ed3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -28,6 +28,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( is_rocm_aiter_moe_enabled) +from vllm.model_executor.layers.fused_moe.routing_simulator import ( + RoutingSimulator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -1362,6 +1364,16 @@ class FusedMoE(torch.nn.Module): """ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk + # Check if we should use a routing simulation strategy + routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY + if routing_strategy != "": + return RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name=routing_strategy, + top_k=top_k, + indices_type=indices_type) + # DeepSeekv2 uses grouped_top_k if use_grouped_topk: assert topk_group is not None diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py new file mode 100644 index 0000000000..c8b107f13c --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py @@ -0,0 +1,289 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Token-to-Expert Routing Simulator + +This module provides a framework for simulating and testing different +token-to-expert routing strategies for Mixture of Experts (MoE) models. +It supports routing logic customization and includes example implementations +like uniform random routing. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +import torch + + +class RoutingStrategy(ABC): + """Base class for token-to-expert routing strategies.""" + + @abstractmethod + def route_tokens( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Route tokens to experts. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + pass + + +class DistributionBasedRouting(RoutingStrategy): + """ + Distribution-based random routing strategy with configurable distributions. + + This routing strategy randomly selects experts for each token based on + different probability distributions. Currently supports uniform and normal + distributions for testing different routing patterns. + """ + + def __init__(self, distribution: str = "uniform", **distribution_params): + """ + Initialize distribution-based routing. 
+ + Args: + distribution: Type of distribution to use for sampling + - "uniform": Uniform distribution (default) + - "normal": Normal/Gaussian distribution + **distribution_params: Parameters specific to the + chosen distribution + For "uniform": No additional parameters needed + For "normal": mean (default: 0.0), std (default: 1.0) + """ + self.distribution = distribution.lower() + self.distribution_params = distribution_params + + # Validate distribution and parameters + self._validate_distribution_params() + + def _validate_distribution_params(self): + """Validate distribution type and parameters.""" + valid_distributions = ["uniform", "normal"] + + if self.distribution not in valid_distributions: + raise ValueError(f"Unsupported distribution: {self.distribution}. " + f"Supported distributions: {valid_distributions}") + + # Set default parameters if not provided + if self.distribution == "normal": + self.distribution_params.setdefault("mean", 0.0) + self.distribution_params.setdefault("std", 1.0) + + def route_tokens( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Randomly select experts for each token using the specified distribution. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) where: + - topk_weights: Weights based on distribution sampling + - topk_ids: Expert indices sampled from the distribution + """ + num_tokens = hidden_states.shape[0] + num_experts = router_logits.shape[-1] + + if indices_type is None: + indices_type = torch.long + + # Generate expert IDs based on the specified distribution + topk_ids = self._sample_expert_ids(num_tokens, num_experts, top_k, + hidden_states.device, indices_type) + + # Generate weights based on the distribution + topk_weights = self._generate_weights(num_tokens, top_k, + hidden_states.device) + + return topk_weights, topk_ids + + def _sample_expert_ids( + self, + num_tokens: int, + num_experts: int, + top_k: int, + device: torch.device, + indices_type: torch.dtype, + ) -> torch.Tensor: + """Sample expert IDs based on the specified distribution.""" + + if self.distribution == "uniform": + # Uniform random sampling + return torch.randint( + low=0, + high=num_experts, + size=(num_tokens, top_k), + dtype=indices_type, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, sample continuous values and map to + # expert IDs + continuous_samples = self._sample_continuous_distribution( + num_tokens, top_k, device) + + # Map continuous samples to expert indices + # Normalize to [0, 1] range and scale to [0, num_experts) + normalized_samples = self._normalize_samples(continuous_samples) + expert_ids = (normalized_samples * num_experts).long() + expert_ids = torch.clamp(expert_ids, 0, num_experts - 1) + + return expert_ids.to(dtype=indices_type) + + else: + raise ValueError(f"Unsupported distribution: {self.distribution}") + + def _sample_continuous_distribution(self, num_tokens: int, top_k: int, + device: torch.device) -> torch.Tensor: + """Sample from continuous distributions.""" + shape = (num_tokens, top_k) + + if self.distribution == "normal": + mean = self.distribution_params["mean"] + std = self.distribution_params["std"] + return torch.normal(mean, std, 
size=shape, device=device) + + else: + raise ValueError( + f"Unsupported continuous distribution: {self.distribution}") + + def _normalize_samples(self, samples: torch.Tensor) -> torch.Tensor: + """Normalize samples to [0, 1] range.""" + if self.distribution == "normal": + # Use sigmoid to map normal distribution to [0, 1] + return torch.sigmoid(samples) + + else: + raise ValueError(f"Unsupported distribution for normalization: " + f"{self.distribution}") + + def _generate_weights(self, num_tokens: int, top_k: int, + device: torch.device) -> torch.Tensor: + """Generate weights based on the distribution.""" + if self.distribution == "uniform": + # All-ones weights for uniform distribution + return torch.ones( + (num_tokens, top_k), + dtype=torch.float32, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, generate weights from the same + # distribution + continuous_weights = self._sample_continuous_distribution( + num_tokens, top_k, device) + # Normalize to positive values and sum to 1 + weights = torch.abs(continuous_weights) + weights = weights / weights.sum(dim=-1, keepdim=True) + return weights + + else: + raise ValueError( + f"Unsupported distribution for weight generation: " + f"{self.distribution}") + + def get_distribution_info(self) -> dict: + """Get information about the current distribution configuration.""" + return { + "distribution": self.distribution, + "parameters": self.distribution_params.copy() + } + + +class RoutingSimulator: + """ + Token-to-Expert Routing Simulator. + + This class provides a framework for testing and comparing different + routing strategies for MoE models. It can simulate routing behavior + and collect statistics for analysis. + """ + + # Class-level registry of routing strategies + _routing_strategies: dict[str, RoutingStrategy] = { + # Basic routing strategies + "uniform_random": + DistributionBasedRouting(distribution="uniform", mean=0.0, std=1.0), + "normal_routing": + DistributionBasedRouting(distribution="normal", mean=0.0, std=1.0), + } + + @classmethod + def register_strategy(cls, name: str, strategy: RoutingStrategy): + """ + Register a custom routing strategy. + + Args: + name: Name of the strategy + strategy: RoutingStrategy instance + """ + cls._routing_strategies[name] = strategy + + @classmethod + def get_available_strategies(cls): + """ + Get list of available routing strategy names. + + Returns: + List of available strategy names + """ + return list(cls._routing_strategies.keys()) + + @staticmethod + def simulate_routing( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + strategy_name: str, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Simulate token-to-expert routing using the specified strategy. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + strategy_name: Name of the routing strategy to use + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + if strategy_name not in RoutingSimulator._routing_strategies: + raise ValueError( + f"Unknown routing strategy: {strategy_name}. 
" + f"Available strategies: " + f"{list(RoutingSimulator._routing_strategies.keys())}") + + strategy = RoutingSimulator._routing_strategies[strategy_name] + return strategy.route_tokens( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + indices_type=indices_type, + ) From 8e8e0b6af189d262bcfdaef6c0cfb94772e86b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Garc=C3=ADa=20Garc=C3=ADa?= Date: Thu, 7 Aug 2025 10:10:13 +0400 Subject: [PATCH 056/932] feat: Add --enable-log-outputs flag for logging model generations (#20707) Signed-off-by: Adrian Garcia --- tests/test_logger.py | 252 ++++++++++++++++++- vllm/entrypoints/logger.py | 36 ++- vllm/entrypoints/openai/cli_args.py | 9 +- vllm/entrypoints/openai/serving_chat.py | 121 +++++++-- vllm/entrypoints/openai/serving_responses.py | 20 ++ 5 files changed, 412 insertions(+), 26 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 8f235f1474..0bfb449cdf 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -10,11 +10,12 @@ from dataclasses import dataclass from json.decoder import JSONDecodeError from tempfile import NamedTemporaryFile from typing import Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch from uuid import uuid4 import pytest +from vllm.entrypoints.logger import RequestLogger from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) from vllm.logging_utils import NewLineFormatter @@ -228,9 +229,10 @@ def test_prepare_object_to_dump(): list_obj = [1, 2, 3] assert prepare_object_to_dump(list_obj) == '[1, 2, 3]' - dict_obj = {'a': 1, 'b': 'b'} + dict_obj = {"a": 1, "b": "b"} assert prepare_object_to_dump(dict_obj) in [ - "{a: 1, b: 'b'}", "{b: 'b', a: 1}" + "{a: 1, b: 'b'}", + "{b: 'b', a: 1}", ] set_obj = {1, 2, 3} @@ -252,4 +254,246 @@ def test_prepare_object_to_dump(): b: str assert (prepare_object_to_dump(CustomClass( - 1, 'b')) == "CustomClass(a=1, b='b')") + 1, "b")) == "CustomClass(a=1, b='b')") + + +def test_request_logger_log_outputs(): + """Test the new log_outputs functionality.""" + # Create a mock logger to capture log calls + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test basic output logging + request_logger.log_outputs( + request_id="test-123", + outputs="Hello, world!", + output_token_ids=[1, 2, 3, 4], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-123" + assert call_args[3] == "Hello, world!" 
+ assert call_args[4] == [1, 2, 3, 4] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_streaming_delta(): + """Test log_outputs with streaming delta mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming delta logging + request_logger.log_outputs( + request_id="test-456", + outputs="Hello", + output_token_ids=[1], + finish_reason=None, + is_streaming=True, + delta=True, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-456" + assert call_args[2] == " (streaming delta)" + assert call_args[3] == "Hello" + assert call_args[4] == [1] + assert call_args[5] is None + + +def test_request_logger_log_outputs_streaming_complete(): + """Test log_outputs with streaming complete mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming complete logging + request_logger.log_outputs( + request_id="test-789", + outputs="Complete response", + output_token_ids=[1, 2, 3], + finish_reason="length", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-789" + assert call_args[2] == " (streaming complete)" + assert call_args[3] == "Complete response" + assert call_args[4] == [1, 2, 3] + assert call_args[5] == "length" + + +def test_request_logger_log_outputs_with_truncation(): + """Test log_outputs respects max_log_len setting.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + # Set max_log_len to 10 + request_logger = RequestLogger(max_log_len=10) + + # Test output truncation + long_output = "This is a very long output that should be truncated" + long_token_ids = list(range(20)) # 20 tokens + + request_logger.log_outputs( + request_id="test-truncate", + outputs=long_output, + output_token_ids=long_token_ids, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args + + # Check that output was truncated to first 10 characters + logged_output = call_args[0][3] + assert logged_output == "This is a " + assert len(logged_output) == 10 + + # Check that token IDs were truncated to first 10 tokens + logged_token_ids = call_args[0][4] + assert logged_token_ids == list(range(10)) + assert len(logged_token_ids) == 10 + + +def test_request_logger_log_outputs_none_values(): + """Test log_outputs handles None values correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with None output_token_ids + request_logger.log_outputs( + request_id="test-none", + outputs="Test output", + output_token_ids=None, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-none" + assert call_args[3] == "Test output" + assert call_args[4] is None + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_empty_output(): + """Test log_outputs handles empty output correctly.""" + 
mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=5) + + # Test with empty output + request_logger.log_outputs( + request_id="test-empty", + outputs="", + output_token_ids=[], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-empty" + assert call_args[3] == "" + assert call_args[4] == [] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_integration(): + """Test that log_outputs can be called alongside log_inputs.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test that both methods can be called without interference + request_logger.log_inputs( + request_id="test-integration", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_embeds=None, + params=None, + lora_request=None, + ) + + request_logger.log_outputs( + request_id="test-integration", + outputs="Test output", + output_token_ids=[4, 5, 6], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + # Should have been called twice - once for inputs, once for outputs + assert mock_logger.info.call_count == 2 + + # Check that the calls were made with correct patterns + input_call = mock_logger.info.call_args_list[0][0] + output_call = mock_logger.info.call_args_list[1][0] + + assert "Received request %s" in input_call[0] + assert input_call[1] == "test-integration" + + assert "Generated response %s%s" in output_call[0] + assert output_call[1] == "test-integration" + + +def test_streaming_complete_logs_full_text_content(): + """Test that streaming complete logging includes + full accumulated text, not just token count.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with actual content instead of token count format + full_response = "This is a complete response from streaming" + request_logger.log_outputs( + request_id="test-streaming-full-text", + outputs=full_response, + output_token_ids=None, + finish_reason="streaming_complete", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + + # Verify the logged output is the full text, not a token count format + logged_output = call_args[3] + assert logged_output == full_response + assert "tokens>" not in logged_output + assert "streaming_complete" not in logged_output + + # Verify other parameters + assert call_args[1] == "test-streaming-full-text" + assert call_args[2] == " (streaming complete)" + assert call_args[5] == "streaming_complete" diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index 06ff3b417f..152d11c84e 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence from typing import Optional, Union import torch @@ -16,8 +17,6 @@ logger = init_logger(__name__) class RequestLogger: def __init__(self, *, max_log_len: Optional[int]) -> None: - super().__init__() - self.max_log_len = max_log_len def log_inputs( @@ -45,3 +44,36 @@ class RequestLogger: "lora_request: %s.", request_id, prompt, params, 
prompt_token_ids, prompt_embeds.shape if prompt_embeds is not None else None, lora_request) + + def log_outputs( + self, + request_id: str, + outputs: str, + output_token_ids: Optional[Sequence[int]], + finish_reason: Optional[str] = None, + is_streaming: bool = False, + delta: bool = False, + ) -> None: + max_log_len = self.max_log_len + if max_log_len is not None: + if outputs is not None: + outputs = outputs[:max_log_len] + + if output_token_ids is not None: + # Convert to list and apply truncation + output_token_ids = list(output_token_ids)[:max_log_len] + + stream_info = "" + if is_streaming: + stream_info = (" (streaming delta)" + if delta else " (streaming complete)") + + logger.info( + "Generated response %s%s: output: %r, " + "output_token_ids: %s, finish_reason: %s", + request_id, + stream_info, + outputs, + output_token_ids, + finish_reason, + ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 12318b300c..e89463a03c 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -44,10 +44,10 @@ class LoRAParserAction(argparse.Action): lora_list: list[LoRAModulePath] = [] for item in values: - if item in [None, '']: # Skip if item is None or empty string + if item in [None, ""]: # Skip if item is None or empty string continue - if '=' in item and ',' not in item: # Old format: name=path - name, path = item.split('=') + if "=" in item and "," not in item: # Old format: name=path + name, path = item.split("=") lora_list.append(LoRAModulePath(name, path)) else: # Assume JSON format try: @@ -167,6 +167,9 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" enable_tokenizer_info_endpoint: bool = False """Enable the /get_tokenizer_info endpoint. May expose chat templates and other tokenizer configuration.""" + enable_log_outputs: bool = False + """If set to True, enable logging of model outputs (generations) + in addition to the input logging that is enabled by default.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6ad0a8ec54..b4231c6d10 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -73,6 +73,7 @@ class OpenAIServingChat(OpenAIServing): tool_parser: Optional[str] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, @@ -84,6 +85,7 @@ class OpenAIServingChat(OpenAIServing): self.response_role = response_role self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format + self.enable_log_outputs = enable_log_outputs # set up tool use self.enable_auto_tools: bool = enable_auto_tools @@ -489,20 +491,21 @@ class OpenAIServingChat(OpenAIServing): all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * num_choices + # Always track previous_texts for comprehensive output logging + previous_texts = [""] * num_choices + # Only one of these will be used, thus previous_texts and # all_previous_token_ids will not be used twice in the same iteration. 
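The previous_texts accumulated here feed the two log_outputs call shapes used in this file; a self-contained sketch of both, mirroring the RequestLogger API added in logger.py (request id, text, and token ids are illustrative):

from vllm.entrypoints.logger import RequestLogger

request_logger = RequestLogger(max_log_len=None)

# Per-chunk delta while streaming ...
request_logger.log_outputs(
    request_id="chatcmpl-example",
    outputs="Hello",                  # one delta_text chunk
    output_token_ids=[9906],          # illustrative token ids for the chunk
    finish_reason=None,
    is_streaming=True,
    delta=True,
)

# ... and the full accumulated text once the stream completes.
request_logger.log_outputs(
    request_id="chatcmpl-example",
    outputs="Hello, world!",          # previous_texts[i] at the end
    output_token_ids=None,
    finish_reason="streaming_complete",
    is_streaming=True,
    delta=False,
)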
         if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
-            previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
         elif request.tool_choice == "required":
-            previous_texts = [""] * num_choices
             all_previous_token_ids = None
         else:
-            previous_texts, all_previous_token_ids = None, None
+            all_previous_token_ids = None

         try:
             if self.reasoning_parser:
@@ -844,6 +847,7 @@ class OpenAIServingChat(OpenAIServing):
                                 current_token_ids=current_token_ids,
                                 delta_token_ids=output.token_ids,
                                 request=request))
+
                     # when only reasoning
                     elif self.reasoning_parser:
                         delta_message = (reasoning_parser.
@@ -865,6 +869,10 @@ class OpenAIServingChat(OpenAIServing):
                         assert all_previous_token_ids is not None
                         previous_texts[i] = current_text
                         all_previous_token_ids[i] = current_token_ids
+                    else:
+                        # Update for comprehensive logging even in simple case
+                        assert previous_texts is not None
+                        previous_texts[i] += delta_text

                     # set the previous values for the next iteration
                     previous_num_tokens[i] += len(output.token_ids)
@@ -876,6 +884,27 @@ class OpenAIServingChat(OpenAIServing):
                 if delta_message is None:
                     continue

+                # Log streaming delta if output logging is enabled
+                if self.enable_log_outputs and self.request_logger:
+                    delta_content = ""
+                    if delta_message.content:
+                        delta_content = delta_message.content
+                    elif delta_message.tool_calls:
+                        delta_content = "".join(
+                            tc.function.arguments
+                            for tc in delta_message.tool_calls
+                            if tc.function and tc.function.arguments)
+
+                    if delta_content:
+                        self.request_logger.log_outputs(
+                            request_id=request_id,
+                            outputs=delta_content,
+                            output_token_ids=list(output.token_ids),
+                            finish_reason=output.finish_reason,
+                            is_streaming=True,
+                            delta=True,
+                        )
+
                 if output.finish_reason is None:
                     # Send token-by-token response for each request.n
                     choice_data = ChatCompletionResponseStreamChoice(
@@ -994,7 +1023,27 @@ class OpenAIServingChat(OpenAIServing):
             request_metadata.final_usage_info = UsageInfo(
                 prompt_tokens=num_prompt_tokens,
                 completion_tokens=num_completion_tokens,
-                total_tokens=num_prompt_tokens + num_completion_tokens)
+                total_tokens=num_prompt_tokens + num_completion_tokens,
+            )
+
+            # Log complete streaming response if output logging is enabled
+            if self.enable_log_outputs and self.request_logger:
+                # Log the complete response for each choice
+                for i in range(num_choices):
+                    full_text = (
+                        previous_texts[i]
+                        if previous_texts and i < len(previous_texts) else
+                        f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                    )
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=full_text,
+                        output_token_ids=
+                        None,  # Consider also logging all token IDs
+                        finish_reason="streaming_complete",
+                        is_streaming=True,
+                        delta=False,
+                    )

         except Exception as e:
             # TODO: Use a vllm-specific Validation Error
@@ -1121,8 +1170,10 @@ class OpenAIServingChat(OpenAIServing):
                     tool_calls=[
                         tool_call_class(function=FunctionCall(
                             name=request.tool_choice.function.name,
-                            arguments=content))
-                    ])
+                            arguments=content,
+                        ))
+                    ],
+                )

         elif request.tool_choice and request.tool_choice == "required":
             tool_call_class = MistralToolCall if isinstance(
@@ -1209,12 +1260,13 @@ class OpenAIServingChat(OpenAIServing):
                 finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
+
             choices.append(choice_data)

         if request.echo:
             last_msg_content: Union[str, list[dict[str, str]]] = ""
-
if conversation and "content" in conversation[-1] and conversation[ - -1].get("role") == role: + if (conversation and "content" in conversation[-1] + and conversation[-1].get("role") == role): last_msg_content = conversation[-1]["content"] or "" if isinstance(last_msg_content, list): last_msg_content = "\n".join(msg['text'] @@ -1251,6 +1303,40 @@ class OpenAIServingChat(OpenAIServing): kv_transfer_params=final_res.kv_transfer_params, ) + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + output_text = "" + if choice.message.content: + output_text = choice.message.content + elif choice.message.tool_calls: + # For tool calls, log the function name and arguments + tool_call_descriptions = [] + for tool_call in choice.message.tool_calls: + if hasattr(tool_call.function, "name") and hasattr( + tool_call.function, "arguments"): + tool_call_descriptions.append( + f"{tool_call.function.name}({tool_call.function.arguments})" + ) + tool_calls_str = ", ".join(tool_call_descriptions) + output_text = f"[tool_calls: {tool_calls_str}]" + + if output_text: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[ + choice.index].token_ids + + self.request_logger.log_outputs( + request_id=request_id, + outputs=output_text, + output_token_ids=output_token_ids, + finish_reason=choice.finish_reason, + is_streaming=False, + delta=False, + ) + return response def _get_top_logprobs( @@ -1258,15 +1344,16 @@ class OpenAIServingChat(OpenAIServing): tokenizer: AnyTokenizer, should_return_as_token_id: bool) -> list[ChatCompletionLogProb]: return [ - ChatCompletionLogProb(token=(token := self._get_decoded_token( - p[1], - p[0], - tokenizer, - return_as_token_id=should_return_as_token_id)), - logprob=max(p[1].logprob, -9999.0), - bytes=list( - token.encode("utf-8", errors="replace"))) - for i, p in enumerate(logprobs.items()) + ChatCompletionLogProb( + token=(token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=should_return_as_token_id, + )), + logprob=max(p[1].logprob, -9999.0), + bytes=list(token.encode("utf-8", errors="replace")), + ) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 3c0b590b0c..f26f92537c 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -65,6 +65,7 @@ class OpenAIServingResponses(OpenAIServing): tool_server: Optional[ToolServer] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, ) -> None: super().__init__( engine_client=engine_client, @@ -77,6 +78,7 @@ class OpenAIServingResponses(OpenAIServing): self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format + self.enable_log_outputs = enable_log_outputs self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = None @@ -428,6 +430,24 @@ class OpenAIServingResponses(OpenAIServing): usage=usage, ) + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + output_text = "" + if content: + output_text = content + elif reasoning_content: + output_text = f"[reasoning: {reasoning_content}]" + + if output_text: + self.request_logger.log_outputs( + 
request_id=request.request_id, + outputs=output_text, + output_token_ids=final_output.token_ids, + finish_reason=final_output.finish_reason, + is_streaming=False, + delta=False, + ) + if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) From 434d2f3f7ab3b6768df59f8d9d81e43bf38204f7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:22:07 +0100 Subject: [PATCH 057/932] [Docs] Add missing dependency for docs build (#22435) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/docs.txt b/requirements/docs.txt index 4d4fc7da68..c589093110 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -19,6 +19,7 @@ cloudpickle fastapi msgspec openai +openai-harmony partial-json-parser pillow psutil From c2dba2dba8e4ebff1b7772ffbe811b0165e844d7 Mon Sep 17 00:00:00 2001 From: JaceyShao <65159281+JaceyShao@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:24:47 +0800 Subject: [PATCH 058/932] Add H20-3e fused MoE kernel tuning configs for GLM-4.5 (#22433) Signed-off-by: shaojunqi Co-authored-by: shaojunqi --- ...E=160,N=192,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000..f2ed716c8b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} From 136825de756f5421283e404e3991b77a9d33c131 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Aug 2025 00:26:24 -0700 Subject: [PATCH 059/932] [Misc] Enhance code formatting in mxfp4.py (#22423) Signed-off-by: Woosuk Kwon --- .../layers/quantization/mxfp4.py | 85 ++++++++++++------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b6d7bc5d5c..068af02739 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -109,55 +109,74 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.intermediate_size = intermediate_size_per_partition_after_pad self.hidden_size = hidden_size # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - hidden_size // 2, - dtype=weight_dtype), - requires_grad=False) + w13_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // 2, + dtype=weight_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w13_weight_scale = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - hidden_size // mxfp4_block, - dtype=scale_dtype), - requires_grad=False) + w13_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w13_weight_scale", w13_weight_scale) set_weight_attrs(w13_weight_scale, extra_weight_attrs) - w13_bias = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - dtype=torch.bfloat16), - requires_grad=False) + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) layer.register_parameter("w13_bias", w13_bias) set_weight_attrs(w13_bias, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.zeros( - num_experts, - hidden_size, - intermediate_size_per_partition_after_pad // 2, - dtype=weight_dtype), - requires_grad=False) + w2_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // 2, + dtype=weight_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - w2_weight_scale = torch.nn.Parameter(torch.zeros( - num_experts, - hidden_size, - 
intermediate_size_per_partition_after_pad // mxfp4_block, - dtype=scale_dtype), - requires_grad=False) + w2_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w2_weight_scale", w2_weight_scale) set_weight_attrs(w2_weight_scale, extra_weight_attrs) - w2_bias = torch.nn.Parameter(torch.zeros(num_experts, - hidden_size, - dtype=torch.bfloat16), - requires_grad=False) + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) layer.register_parameter("w2_bias", w2_bias) set_weight_attrs(w2_bias, extra_weight_attrs) From 5e8398805ed6b6e59e3408fe64ed37d189b77149 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Thu, 7 Aug 2025 00:28:15 -0700 Subject: [PATCH 060/932] [Doc] Fix link to prefix caching design (#22384) Signed-off-by: Yong Hoon Shin --- docs/features/automatic_prefix_caching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md index f3c4bdd85c..c529da684e 100644 --- a/docs/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -5,7 +5,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. !!! note - Technical details on how vLLM implements APC can be found [here](../design/automatic_prefix_caching.md). + Technical details on how vLLM implements APC can be found [here](../design/prefix_caching.md). ## Enabling APC in vLLM From a2c6696bfee0ec2275dc08b15eee154ed0e9c0c7 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 7 Aug 2025 00:29:13 -0700 Subject: [PATCH 061/932] [Docs] Factor out troubleshooting to its own guide; add section for Ray Observability (#21578) Signed-off-by: Ricardo Decal --- docs/serving/distributed_serving.md | 29 +++++++++++---------- docs/serving/distributed_troubleshooting.md | 16 ++++++++++++ 2 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 docs/serving/distributed_troubleshooting.md diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 08d889a00d..fc9d9f8a34 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -128,12 +128,17 @@ vllm serve /path/to/the/model/in/the/container \ --tensor-parallel-size 16 ``` -## Troubleshooting distributed deployments +## Optimizing network communication for tensor parallelism -To make tensor parallelism performant, ensure that communication between nodes is efficient, for example, by using high-speed network cards such as InfiniBand. To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Contact your system administrator for more information about the required flags. One way to confirm if InfiniBand is working is to run `vllm` with the `NCCL_DEBUG=TRACE` environment variable set, for example `NCCL_DEBUG=TRACE vllm serve ...`, and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, NCCL uses a raw TCP socket, which is not efficient for cross-node tensor parallelism. 
If you find `[send] via NET/IB/GDRDMA` in the logs, NCCL uses InfiniBand with GPUDirect RDMA, which is efficient.
+Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
+To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
+`run_cluster.sh` helper script.
+Contact your system administrator for more information about the required flags.

 ## Enabling GPUDirect RDMA

+GPUDirect RDMA (Remote Direct Memory Access) is an NVIDIA technology that allows network adapters to directly access GPU memory, bypassing the CPU and system memory. This direct access reduces latency and CPU overhead, which is beneficial for large data transfers between GPUs across nodes.
+
 To enable GPUDirect RDMA with vLLM, configure the following settings:

 - `IPC_LOCK` security context: add the `IPC_LOCK` capability to the container's security context to lock memory pages and prevent swapping to disk.
@@ -175,21 +180,17 @@ spec:
   ...
 ```

-Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand. To enable InfiniBand, append flags such as `--privileged -e NCCL_IB_HCA=mlx5` to `run_cluster.sh`. For cluster-specific settings, consult your system administrator.
+!!! tip "Confirm GPUDirect RDMA operation"
+    To confirm your InfiniBand card is using GPUDirect RDMA, run vLLM with detailed NCCL logs: `NCCL_DEBUG=TRACE vllm serve ...`.
-To confirm InfiniBand operation, enable detailed NCCL logs:
+    Then look for the NCCL version and the network used.
-```bash
-NCCL_DEBUG=TRACE vllm serve ...
-```
-
-Search the logs for the transport method. Entries containing `[send] via NET/Socket` indicate raw TCP sockets, which perform poorly for cross-node tensor parallelism. Entries containing `[send] via NET/IB/GDRDMA` indicate InfiniBand with GPUDirect RDMA, which provides high performance.
-
-!!! tip "Verify inter-node GPU communication"
-    After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to `run_cluster.sh`, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see .
+    - If you find `[send] via NET/IB/GDRDMA` in the logs, then NCCL is using InfiniBand with GPUDirect RDMA, which *is* efficient.
+    - If you find `[send] via NET/Socket` in the logs, NCCL used a raw TCP socket, which *is not* efficient for cross-node tensor parallelism.

 !!! tip "Pre-download Hugging Face models"
     If you use Hugging Face models, downloading the model before starting vLLM is recommended. Download the model on every node to the same path, or store the model on a distributed file system accessible by all nodes. Then pass the path to the model in place of the repository ID. Otherwise, supply a Hugging Face token by appending `-e HF_TOKEN=` to `run_cluster.sh`.

-!!! tip
-    The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one.
    Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in `run_cluster.sh` (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see .
+## Troubleshooting distributed deployments
+
+For information about distributed debugging, see [Troubleshooting distributed deployments](distributed_troubleshooting.md).
diff --git a/docs/serving/distributed_troubleshooting.md b/docs/serving/distributed_troubleshooting.md
new file mode 100644
index 0000000000..bd45f010ed
--- /dev/null
+++ b/docs/serving/distributed_troubleshooting.md
@@ -0,0 +1,16 @@
+# Troubleshooting distributed deployments
+
+For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
+
+## Verify inter-node GPU communication
+
+After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to `run_cluster.sh`, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see .
+
+## No available node types can fulfill resource request
+
+The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in `run_cluster.sh` (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see .
+
+## Ray observability
+
+Debugging a distributed system can be challenging due to the large scale and complexity. Ray provides a suite of tools to help monitor, debug, and optimize Ray applications and clusters. For more information about Ray observability, visit the [official Ray observability docs](https://docs.ray.io/en/latest/ray-observability/index.html). For more information about debugging Ray applications, visit the [Ray Debugging Guide](https://docs.ray.io/en/latest/ray-observability/user-guides/debug-apps/index.html). For information about troubleshooting Kubernetes clusters, see the
+[official KubeRay troubleshooting guide](https://docs.ray.io/en/latest/serve/advanced-guides/multi-node-gpu-troubleshooting.html).
From 35171b1172fe59810612ac35de9ee29ccfbd8b65 Mon Sep 17 00:00:00 2001
From: Andrew Chan
Date: Thu, 7 Aug 2025 00:26:24 -0700
Subject: [PATCH 062/932] [Doc] update docs for nightly benchmarks (#12022)

Signed-off-by: Andrew Chan
---
 .buildkite/nightly-benchmarks/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 3721d3d1d6..3f2e2da397 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -168,9 +168,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
 ### Workflow

 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. -- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. -- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. +- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. +- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. +- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. ### Nightly tests @@ -180,6 +180,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). 
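Editor's note: the GPUDirect RDMA check described in the distributed-serving documentation patch above comes down to scanning the `NCCL_DEBUG=TRACE` output for the transport that NCCL reports. The sketch below is illustrative only and is not part of any patch in this series; the script name, function name, and log-file argument are assumptions, while the two `[send] via ...` markers are quoted from the documentation change itself.

```python
import re
import sys

# Transport markers quoted from the distributed-serving docs above.
GDRDMA_MARKER = "[send] via NET/IB/GDRDMA"  # InfiniBand + GPUDirect RDMA (efficient)
SOCKET_MARKER = "[send] via NET/Socket"     # raw TCP sockets (slow for cross-node TP)


def check_nccl_transport(log_path: str) -> str:
    """Classify the NCCL send transport found in a captured NCCL_DEBUG=TRACE log."""
    with open(log_path, encoding="utf-8", errors="replace") as f:
        text = f.read()
    if GDRDMA_MARKER in text:
        return "GPUDirect RDMA over InfiniBand (efficient)"
    if SOCKET_MARKER in text:
        return "raw TCP sockets (revisit NCCL_IB_HCA / InfiniBand setup)"
    # NCCL also logs its version, e.g. "NCCL INFO NCCL version 2.x.y".
    match = re.search(r"NCCL version [0-9][0-9.]*", text)
    found = match.group(0) if match else "no NCCL version line found"
    return f"no send-transport line found ({found})"


if __name__ == "__main__":
    # Usage (hypothetical file name): python check_nccl_log.py nccl_trace.log
    print(check_nccl_transport(sys.argv[1]))
```

Running it against a log captured with `NCCL_DEBUG=TRACE vllm serve ...` gives a one-line verdict instead of scrolling through the raw trace.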
From 289b18e670c2439dfc1f4f80df782de9ad112762 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:59:23 +0800 Subject: [PATCH 063/932] [Docs] Update features/disagg_prefill, add v1 examples and development (#22165) Signed-off-by: David Chen <530634352@qq.com> --- .../disagg_prefill/high_level_design.png | Bin 0 -> 92801 bytes .../features/disagg_prefill/workflow.png | Bin 0 -> 89969 bytes docs/features/disagg_prefill.md | 25 ++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 docs/assets/features/disagg_prefill/high_level_design.png create mode 100644 docs/assets/features/disagg_prefill/workflow.png diff --git a/docs/assets/features/disagg_prefill/high_level_design.png b/docs/assets/features/disagg_prefill/high_level_design.png new file mode 100644 index 0000000000000000000000000000000000000000..ce9b1c88276602d08b2aa4892e4c3e3aeb4fe67e GIT binary patch literal 92801 zcmdSBbyQp3w>69uDQ?BBNE_T8LUAZitSwL+inVyKpryELaVbt)9E!UI2!sMH?iM^i zaQ)Kf_uTv5Z`?87|KE(0k(}(2vrqP3d#yF+o-2{suT%)}Xz|d{&Ah~_`mYnfOKFw<*8kB6Vw8d}_V!|mIWH-`e_!Mg{coGJZ4#SY)W%Tab1gBa0-j!}jgH(S`2_L%%C#|9;KMCwZz%-7G>=i- z1?_CN9p9PBd2*T;yYZB!-1mu+THAt(ogE;KNh<1B0*U+1#1l;H)5b$zA5>p-VorTh zh_+y7TL1X4JpcUv?&jM<%Vu2M!Q9~O#ld_-_tMf*%X14d3W^Zj;+Ng?^K`OjbP@8Q z(SsD|N;+h+Df9RD_ta7!9ue&?NB2jkeGRzts_@N=3s1Sa!l!!gUj2S(;#&9LNi~(^ zNGg31MnJQ=x6b3FK})63)f|X4mkzKI}L1BJLbvwfYanl+1rlRrlzkZ zkXJ5lo&!y;>*dbjrSPZ60fWj6ieSacd z;syGHJN&OvlA!MAgehO!)zedc_ccc%r}Z#ZM;psn8lM&~#Yi3w*2$)FoP%MxFQeu$ zOXy~@l!_>-t03f6o$60t9NmM3&#zk>+J({cZZAoj98AN7v!=S2md9GoHrF;b0Ap{4 zQ#eLIXWMeL%DfHI!5TN)wh8iC_E&0$`}99%QsHw$C&tL$+CC*g92^QRUG3Zyb^dUs1o`k6-XOu^ zsCJ6gp6wKA#H4VNr>ELR@-NycHZRr@EEN{DqANugwb-lL$2%>}Jp^Tp&#a3-9SlZOsjs=8+`>GDmlwD#~Sje;Xd9Rf>* z9g>2T6%|)DGC<;i$F;Yo#JJUmU{`T*wkK7|`-Xx;e{FAgebwA?hpazlhBy3Vs(W@= z*&At_Dg4dK@uS`5Xc9*uHRrZHHMy^mlm)OPw8lY$xkxpa{tag;BvxMX`h0xk!HXd1rD!UhvYGJ2NW$&6l7kzU$d@OUY_L-50p58(R zc>y`Y&wY}bhRw0$HSKP1c6pUjs(^B|Y|K49Ju%uXpFfH%UpJ)Ev8@de%>52X8_l$rB{~Y>C^lsw*&09Hh&Tt8vDg-OY^HF3AW@o^1$iC zkiS21C`F(>;?&)q$7x!$d(@LzbN4sEs5af(ub;x6L$ByXkRMpP`h{uHs|-P%NO9BX z9MV2rCN)N(OLI>(f5W{)oj+Zvk!Qp8w|!f?FpwSvh%>7wS$Z9I8W&FG)U+HHUMf~; z(i!A?acFqaMs4s6NZf{)t+jde`a(v)e0kdZ&IIMJW#3jBN4A{p_WpT;Ny72c_^36J zw<0>`m9FaY@u|FvON4`$0N)-47}Pr|!qDdLSM182AWMu#H4;CPA-Hn5=!;8$SG)B- zh=tSkyj%x4C$Xm1C>m9G6oEUG#`+Ts+9Vl&LPi#LCCn3?`^ct3M`B*}=Q5((NI5bTV4lzh$$ zT(P2~Y(3%^&jSXfw0be0b(p9ItO1ftI-^>hc1U+N`i7Ns`Lv1_I_X}e@eT@HOf(I8 zLZDQMCxhtX3mu^mF&PQ_Q<_C{t>oXloPyUw>zdM7p0MbZSgLbDNLyBAtkOU-?<|Zuc5s;0q0?pYFjNC;C|u z_T{?i_t#AbI}R@m@s}{)BoUA@o7SPjq3;nUOWIpP(IkS)0pC8~VYJ}33k@$wFZt7a zR~(pC4pM?GSNR3bBjBa6QaGgtFu5U!N%WWLQU`7wP=)*B(t?Cin@ak}&3k4f_t9*e zV|7GMI`>wK!M!^#@76-9ua;{LpVq7Quymb61#j;VX*2fdCjw*MqF~r(d3i?ffGa=b zyBQsBWB$B8J9EPfvuZUcLgl`#xz9g=lDWjz-|ahEu)y{8mAZe8#z^>dqLi&CNW*ET zNiq{Yye!MSE9u?#V@S4><9R9h<>j@S3?-Mb_HY+GBNwafetQs{lan}^62G|t_9oU# ze^q&K?O2@+2Eh{$prwV>S{3D`v^}-`VCSfl@}s+IYozjm$a1gEXy(ujM}E&m&-XV! 
zTrQ6+{4S46w-#3%7LjYKzGd#m1t&oGO*Cc&6|YHm#XMt)aiV1++&B4-vV(P&XjV0Y zr5PMwns(b<+&-~8)>229i@a*`fUM^Ob7EUdWX=Wr)f)O6&aag$kuK{i z7&lDx(!WIauDakhHw_g{_Sau!e9!Bh{3U;%R08Bz4hsf(r zsvZ@aG`%n--4&g=9Dl;)1*2z?o=LCnF5U*bDNud=i(h4un%^v-ptZV2j@O#j7VjC+ zvmLMziBfnt1PAhKOYI*WFI5?izShPY;SBp3~^rwXgq*&aK#J<5XrA<6=se9}>r}O;t zZ&iHoX9jVRSa@DU&x)3s`?C>(`x*+_zsIPCv=~%co&HQVRG8q)egQCe#!<;rddBez5~bM zR^aYTlzf59#*_Z)$D>L>C-Ahn%)%f~nwA>HbQm%Tn_p>#%@yt2TFSjHmq*7<~_ zQAg~4U@*5_%&;0kNH=B;#ZPw$6KoA$-QFI$RDYhglNq*`WVz*?%hk*kY9>9E1}2m4 z(sywhmRD=AuXr4_csrQ}6EgNnayoX5NsM@f1l^^l2ICrPKzIXl7p77ur`1}@1j%Ik zJ2s_~bceh4UQ;N6X*L8A=2CmU@fgS3k&`R~(y<0IphJvG<96Pj7`>vCjbUDdM2FUF zZk^-Wa^GlMAad}lU0j)HukwaWL*qs9L-&OY+y?Yj)JBrdebP3o3w^QRS-X;=Yqnf3}vTYDh?{GMgA2Un4VQG2DAs!*Ii5p5s=Xs`q;) zRmm#{{uOs#o+wv35@17@)EnVHg@a@0P~_p7_;EN@B-)tuCW^k^hOYN|6YO{~7FM5A z(r!&R5@1C-2|EVX#WlxYgn7KD^BmA0CJrSv<_&)akwd94dc)| zH{NPqmDzALtb1QeZ*?dWAIW2i0Uh;B$6RKQqD(#YOUb_o^{U!rxG@u%sp&; zPX;G$Wn%|)ge30z@=@%8;j!4`^r}s(Sw*`$cTG6Um2SE#(Mdm4f4%-IvRHXf+H9(x zmr)jsjT1*7e&fhmqB%eiL59Sl&-p0k^iq;ZRyHFraChVi5Y0zN^Dd)VR+PGXF_GnZ z$t1)YhKKMWTtUKGamUhW^1&zTrkvRKLXcXy==+*(v(aVG{Ct-dZz3;~iKo@?4HeVZ zH4(iVzQo_o9l~>(dF7!^$Ywp|dKG4FBO-7A6Lx9YE4+)O(geSrN!t*!FJdlYIX^aS z`hq5}gj__Kw#c1^4=$87FW>m$Ut}k%8)j=UCZmU>Yq#kE>Vj;n7-W2;aIADQ!xBA| zqNNf2(9dI((7vz?%M5GPoMBvjq!hy1_<~PTLyk(Wt1@--qV}rzL{kd_g6F9Xw z=d3G;wFjP1cnMeSjyDO7*1f0(#?j;0gHullDI1fjqQ+;}hdHop6{P8KFK+qW{)%Ei&QOI`?XW6kUT-lA?4?w<7mM4p{}Mu~ zVISy~Xz)6+K@paf%VFx7uT%GiyR}5y-ZrEVRJwf1K7)D=imqrHVEep0F?Icvb*GGH zLe!KZJWt(~;GsdRm74sEy1yGiKP$)@y0ctk3MQ7A8TB*ZC2A3p^{G9ksU{pDCn`Du z&JC+#>U8^oxgqJkd3;_(GXsO6HS& zw|cpjzmoqksUkoB7cnj_iH_SZ_sc4}{9Ax`_NXolnIq|Iy2Y$_xi5wR5Cxn&^8M_E zXstGcKlk&$bD5)8PwaC$F8;u!rvZ5Fc_0%_Tysc)tMhKAMV zP7vEx=3&-HJZ>{J#hZ2t4wB3&w8Jdy9-ufbsdCKn&z4OHcEzE4Vrklo5YnrA<^Aw2 z7;7+LkTus}wjU&G+JcGNPG(Kco3FR0r&!K%-EwclMG^~3MoQZ~-nMkjI-!p@5)A86 z0_csjG!>=7gWlS-$YQ78&|PIqiqR*C`yR5y<%m1>9Xv7i>SUjv9k2DUv3pV*TT(~V z*6!c$k4z$MhrG&7MCaJ*mE+Lj^RYQP&wN!n8+g5smEGzh@m*oq7JPH)&+>hTau|Uj zRT^@ZD{$AD>Q3k9WFI-_yArVhJ`LH-_6wuI{Ira74gscz#A zxHwis=gsA{>`DOxJsF-e^1hbvRyo{~M>A6xR{-&12_8)p z-W{KP9JgKcF$GMb(jld%N}>nt%~kZ&<}~S3W7)w+0h2#qHZ#>=Z0(i0_%6@i^AN2$ zyWEEcNarNJc!CqM%qrL6qx14{0{qjNCREa*c{{OdhqDbi?cAb1?~ujWZBO#}@n7x$ zV>LR3Pp-pm=EaE>o%QBT8sD=HT>G63{3Jf{xVMvc;f0?!Tw@8$-uW4lMd5TLfC*$N zglpu`dqFyY@e}oY7;$5`VFhG-@x-d9O(HM1+tgbFi2S@U_-&Mj``TPl1CY&{Y-l=> zy_aiq$H^wtGhMHtY%aUMbqbJ2Qu2e@lh-dN%XFbc%}(m=vp5;pJt_LB+3P2u5bL_f z)<~()c86%pmfgvp%b)E@s?0(sLqV2TjbaSwL(9LjObqrxlN^s6k&80`*GK7R3i#Xz zCOT+W7o_!@ET6ZNXa_|Py@_pxUB(Ko?^iJ86@f$zFb~sjU}n||a|nLC42JI;7#P!K zgfU?D*k(Y9$46n^DM66gCg%=Won;(ka3?76wlLiVVz4hDkg3$Md<&BB*Xq8KqG&56jLz`!Hs=p*NaEs|vsX^ik3xoYs3{nZ4 zQrEBYoAgnxKXGR3but>O--Ev%5O*F#N0I9HoJk5y+j!`$+-@eJ3O>E$$QJJpx(-~1 zhS1WKxxhstM5PHsaSc>_zgOP82+=7>-hWq*_D;YI16CnEAH7j|EcKF zOxQZ0r8(A70tX&XFUv7qf6--uY%~7iJ8}d4R^XQ_zU^{3^Vqn>?MbP|y|CgC^QY)& z0`8hP3IDh=IW1hGU9Kf&L|>z57R%6)EpZW}4m+|X!fqvR z3{+=5Eh_;`NHiNj5hZF|pXbK!u#H8_!}gdudsktO-}yccLqK`KST(|a+iE9;@JeLb1a;&@Kfz%Es@&gV5t>i{V_ zyEt0pb9m?Y@JLwrTjwbn9B0!H;XT>O7YT%^1S{#vqZ6}goNMofPs*|`Q?%*lt0S!R zw30eE*~+O}27IqBhnMRqf5{yFtYwcqf>D07<>fVK^n2F)nm7+0voi_ZmR_CRCc7qy zzZ^cIUShqk^Pl$q(QCOQjrYtE4;f<=poxKX;SDBS#5>lvQJnd?`azeD?VX_okibr*3S&x3)ujygT~9;Ul4r>0+yh75n2n}#!IB=@+pkTh|zD#(syY)^uZK2{~pQeNUzk`1K}|xb*VhA z`75yFt(v#V>4v$aPtQ1sT$LEms*SP4ie_fnnii-Ka!22CJ9(C;Po9K^Ir!aMKgAo( zjNY#sA>~eB49OYS*zAA*$aN=4sgO2zrn-GF7rO(^@z*He8b0ymqacR*bYYXJox4RL zdm^7)mhVG@$xVUjLVRCa5u!vBC$9zZv|5oyvB{c*)R~F;lW^8_6kTh@xHqFBt;O)! 
z8bT^PuD2Me+Ir2@RBpsR<(@0`)$J$1mFc-dNI9X7;ye9EPk4g9k`-@~8`KhDsHchK zoiy!{d99FpO3!?)4pOsLY^_TcqP~r}K)`a+;DQf)7yUB1l?nSA{FlshH}mdHH?s9s z4!Jh(Eqtqo944g`%`gDH|k4YDi_ej85;nCrR2{T`rb74>B}+BqZ+k5+S0c z8bfPjm#Iw9xt+Ir)qiwp8F&-V*)~s3P7;G25xo*xN*Kk{c&fFony_Qmrjb9;k(HO% zS$`Mf+FN`ro~}WRM&xpoD@}a-%GJa1Fbns3V^r zg%0TGdG1~?NOK}Ru98-xb3wn>dPI|6USdSkSc`~?zqAi7{7PxzdJq&pibqS{v0*p5 zp(R>IU;t5$=-G;#R|J{Vcj3^D@9C9x;F`Bo&f{Qp2l1XNpGu2qb*R?NIAYSYoMgN1(Fyy0UpCdL>u}@HXx&m3Oz3n2%$K14=nVhuMl$Zo-DbV zKA6CM*yz}YN8c^F~n_#Ot2eK%hcSTmg=-mzo18kkT$K?+i+~F#X-t%*esqsu5LSWMX3`|TW z$v}5Rg^FCGW7eLu$m?)?+sxd{YZ#wSd6Xbj(!YY&<_;% zLi;8dk9<5xj|haH!(+pmei=2oc1WA-vF7BsPP?OxU=54H{-iaZZp|IU0JJ=zCfrv7 z$S5MjxA1UwPD$3-x?FIl__Pdf8W=^9CJda8%hDI+Pb;{yIiAc+4s14ekJ}NE4WW_9 zph>qC>7CWCB1em@W`e$=m4>n}2-oYFvQPOG0~%w0*Bw%OC*}|a6p>pVGBgT_bYh0^ zW)Qr#0_ERURdY`}FD1!w0^u}5C26Xs(!`~*E{6wAu#MI68Rr2RIl{Zl%NChWmXNhXc3j z`kJqW6K2ePh=liN0}alPozKmzC6p$;Q~`et;6cdGf`T(aOSz+2+2_yj6K&M)4p@Q= zk1MO6PjrUgh3yNUl>7yJ_{s1~M*1f|9pj035ELvSYYB(SRr=iFyKgw7T1lse<7T_& zSD=4k9Lg%WKJrN<*UQV*EsyUz4XKYmbVg1go_A6rkqZ zIDy$)?LEw=t;~9ApbF7rpa&to)ol~7UkWsPHM{vQx0W4 zD~@^sUg@>gv=psvGc(>3v3XZnIE?$lwc9pmv*1Zc;g*&gah}*I_bM`3%!|l%)wjaf z-f$w6;iYFo69qlAU+cxEP6E#(Uxj1ipSs}J0HcZBei9ZBjvk<%@@t`Gc9=gLF}B{4 zE|#XpMo$3H<0L^p!ehhWM-|chH-h0A3cfbR zAKa;2*pZr+5Ek@*K?XDf8#SAZn*-K^ysXRY5sra^xr24WqYgJs!;g`r`JcI}_?@HF z0>r<>1j_EUW**R(!25S|VM>Zy>&9CWb|c=h`@~8QeKwIc?ft3eIzc|ow}9X_(kXP- zo(I{Lwv*Ex9_Un!^Psns*rGEO2#anc^r<7Pd+Zv?MBctKxqzi=K(#L%O!v1hEGpi~y>~!?4 zUlwde$Ht19X$x4~D0z9cK#{k8_Q%Cg5Yk!sY5b3?jpzg&6$v*d34L8PXj!p)cc5{v z`o&4YEfx>7KJ-y<_q0lgNT@)Xd25W zXlrW*(t2(m3VDc0)Zi`wYrC zHN8V`UznMq5}ob|@~fPOovli^jCIrX$bHi7j~+(C{O~rQtcNGrJ%jKZM*oL+jwcUa z*h+p8pVmLu%8i+-FtX|?vL7`2+Qlm6A9eD*mrBHp_m;B8YE0|&zyaCB)54Xgq-|(y zcjZ>=k2U)ZKJjrVC%HE_R?{T{oZ%8Fp?sObM!W(2YCvyN-;LlzZsBL9zQudQ2t9BE zwoqpvmBO!%dvS9(vUA=qYQDQJG*fplfLTk#ZEl@dwkb=bUAY9%(rq`X_Lz&7aI4+j zn%gS=Y5)DrHJ(q0)yVvfDYNE>zwRzCVB7>+h@QX=KP4bZ}1&eH|Lk3a@nno2`A`k!TsoJJH~G zq_6r=vYNzKdtaeu3%$^}m^m5Y=SfqFmcTiXa~N|x6;zTnHC`z&Dcqc`Nm~;vBZ`0O z=ae4b-aO_PU?_^WJTq)he&Ba64#@${A7;QmPY>Sm+-N-S$rc=xHfuPLj_aJSRM~ws zd~fb=FlOcPJ&HOo0VfV_%v9wzV)QH6IcQBeB3sIJ>mIQ01&{% z#eMMrY$qY354pR!U@g|*QctfoK}n5xY{qh9oAD`Gc}gwX6)!8H8LV7xHc(3TDM#&k z{v+b@A=&vkfJ|nEX!TA) zxxU|q@XlJleB%2)o0-UAlE2%tL}SQO2YuEu_+xn8zVPRxxmn=R42WJ^blL_rNE@^c z(>j1aQ$z9`5Afxsnmv&w?WqEFRqr3pO?Wi&} z?GOx&37#i-uYrlRm+vsNru!^46gqn>u`VGlu3PSczQdO*?aSzOXdpd5bMMw-m}@Li zbm&;^a7roedYxT=p!*7$O(D{glj>na+aov*7-+uScSboeH2jLw@jT3PA!DtHleu>>Lk!oqY6}WctiJKOR75#t2JuODo0Y#2_{@w%?ry6;jZ#bjN#2x_BjtH@N zG|u2uota6!v4(hre3X#7U(WBOc%CvkrR)m&q;S#<4Xv?nDugVM5RBO1)`klw2Z-K` z5FwO_ib9{Pv+p;$(sp6$ug!W?ZqLqx6p`gHWgTgc->Vq=hU=~l7n$5;n3h>V<5%L| zo1{z`YskUKyaHu8N{u-RRk|EyPj|N&GUWMS0R?!*hV1SH zLq~t-m6kT9Og8X=lm|eoULqvGfcsRW+ffes=U{I5dzm_*Hv#hNc0+ei2E~w`ssuN5 z;m$WcJ=fHO;29+7uKJe?h>xXDDCJ5yFxOj?L$~D9$9Gg9mr0E0KL)AcYN+b9ZGejZf_>E@W)pK|637AsPynQd0wk=cI?x2eZ*b zdZj|LOka3`e-&&gikgaB+(!Wjt4FDT;G+>krSfpwk(d>X&{h32p35mP(}k{4+IQ3@ z)?b-higy$WEbEYo5WIk}Z+}+*61tovvTF1r_m+V3f>s{6QTM1&Z?*D2ipW0DU$N%N z%dX;`!{;8=ie>Pz=P^Y8$|V8E=m*W5|1GimUsBHhtIqTPt;VDUF@r#=XaBz-j+Vrx zX1Q1hsN%giTnvWfNQwJxkhDolldOL%1Q1);radyOwNa?CWKR$^k(`;IH@f&crtz%I z%xHHvr}C;UBvK>Sl-VT$O$y!gQTD|}K?QWtmkkN~a}2q_JCxvhw23<@_32yXm1C4WM} z@mE2rlcJLU*HHSxP*V@?4hap7ba~htD+cISa)%KyL1zl(2RdOootbIZ*#4`e8j;i_H z5lalbyu6$XUrJf*CyEABILLt?XGrp&K6U8?_qM3~t31smdczV$hI4^X>6t{vJ)#AH zKy^-UQc8m{f#8#k#N}Xx`+2|*R8N18zaX-y$=NmMxum}$n9^ZvEI2vkKWl(ub4Uj@ zjQ?%A$MpYGDoQr?zZ=PjS0+e5PC0e$itM?-3Yg7Wh6r0?U_yHFy&{|v-~*?%tk;|hWQ&^4rIyRyF#GAmSS-tAm>)O~ZN)8Rl< 
zq?NanIl(f4+Ok|--3gudI%Mvlrz0=PxAW}upfg<}Hd<|79vdTW90>>v4BVT-S$*;8 zro!l%iSdmBo2L6;FUKte?MH@(h8i}$!+l({(Jgp#G2e(_{#~@XENWo3K*h)yhmvO> zzd8~=BNbx6p-T^&$G{9`<11-=Imwr{htyw@gU*{cTp8p@*Nt+S*YSNOBwbRSU z&nO!ff9K4ju3pLql8+>i6xAHj@{$9xjD{;=N&W!JvH)k zI=B(>n@}Z=zPB+pYc)%Gv@iJ6+2F_979G8B3Z_;FAm(O^!*s8(Ei5%8q2uEO z#?n%%J;AL6GHrH?cElqc8F=i$>abAH{(*mOAU4bHSV5}!)f5zN^AHwm(@AR&cWM zZF1Y;oI%{1+Vk8~6_q#{1e+L7sM5M!9Ln5Y&-<~dB`HoAH~YNV`bEA~1BQwc&`T5C z-=lli`G^#$CK~tM%z4nU5n{d1^2~8Lp0E~Y(~Q!f5we8DnbeM9S4=OA#cR6P(cj*D zC!G`E`6ZfO0%|lhX!n$HgO5Zev8%64Y(lqx7l#`R4vl6VDjVdz_#t{gN9#Rwp$KM*SxdNY<4U?8I-TIX7>J=6 z5s_H=I(x1#PlT$GP`Ssravl3U+j2(ts~gw<-Q45g{Tc%OO4H!vWOCgS4U}>qFDEBw zL)LA?dn6N?)vQ7Uc+3@_8};4XPZ%O@X@&7o3h+|2)RfQ_^=nv$(IX%i|o1u%-?{rT~RbWgD2 zuB_LQFaGX=D4B`3n5q_+d}j1+slnQXP4bm`(q%rgkLoQZ;^V|TgAFq{~O|$ zerTrC*=8kA+p~UXpk^z$u8utQu6d@->r)HDpZiRw+<;Uf@5c`=`W*W4mBO)NBJA@A zwP7WThf%Ll#ZPtR)-lwjBDV|8$TGt{NB4snA~j7zR)o^9f3wF;gSRk$m3;|JJVFrO zcy)O;ePuU(>+5%Yta5qX`b-c_C}0Hln=PIX5t_?8#Ysv>wvz7WOd<)V=bq;-f}5`r zrNJn+1T|~58BM~siJ-fAu| zk?AQQ01{)t00My}&VBaQHFS?C`mt|b4Qb@@Ytm9pI%`-gW{T(AZYRiLRy&VXKF|D4 zONa7+>_oY*+sFfAsaFn8zHk}X^_^Uw(XGM}@dPvzRtwv!gUM1V9Jwc8AIu5{CW7!VF6vG-;^35@jiJ2(^DG)-#%3wq-pnh7bXv&$_7S=)-%ZFq$jwjj9Gl*?T%Oidt?t43AU4p{v!L!^k3F$` z{k(v!MVoMx*+-q_TtubbO8#7yt2#6}x+&;S2ZBWD8FCV2j3`fc$DdrCY$f{En6FxD zWJ@o$ePT@zQXz6boFPJm4MjKBrS*T(qv6T?H5m)HvtjFzY~T|rKGW()!~WVffS25C zi9p9Wcl+ZjDzie%;$av5yNbSuzu8!jbgXw{cNKZXQ9Xgz&XQ58;mu0B*Lbs@)Ne7% z6rG*GEFsKL{3rE{7EfEe;r9hfcgO9sGm6!B1vG-1P)8{y^D|Bh{0`4m!E<%y;h%=cMy9Nt# z#7KD19bl<>#B}ijnACi@_m;0N>LOmL&3U$oc1N=r7d|F-dm?pgU~qr2R(2p;e-Pn_ zatyuLOe;5tj-=*o_jxmmN5IANLLvVF2Se9#tD1csyN_U6QO%@V9M#Va%ge%`2zTK=MMuRux!l!_X&H6w;$PY z?lVo=-vN#2ZEbCe+GzmO5nVJ)I0O`Mca&(w@3n*|dwb@gaStxv&yw*1hQ(7eurIN) z9A^5NrMc-z3*{9D@+jm5V6qE#Xtqr%J?uAwbk_XuKPyoKKXC6(RffhL%maThm+}{g zWS~-3r^~ZIGDL$Tsz{=1Sv@e#-90OwV@cxjxN`Pz4<5dwC;sGXy@sB5r<{Uurzsk&MxvK81v4F|(e0&6c z)oSU$&_5AoH4Kt{qM2y57^sg_&EfoMtncle{w)uGd;CWg!8455@5cjj)H^RfmS0Q(?PNBiMXRO!TNhUWrnJf;ncN+>|2>}CI)(Wi* zNxesRq4-o45;C+ffvRI057I@2SnB0!*Se+=Vmb-hG(M9=?31%;t|8oKtiOqwBq%3K z#(H8h>MA4*R8vRLhctsu@2KrgJ3mWODv1cv5*{ zhWuTZp;If{Lq~}LPQ^Rn{!*quTqZMF0vaG*bSk$}>PUuw=c&sMH^1iOv)H44+q{?U zRfesmFu7^Jmaa)5%aJk*GPQtk3EAL7ohDFoCse<3*uM}51Lt{Dd@g}g<<5~_m6K|+ z5q8YxcGIk%DiiQ^ic?7G)SC3eMm zp7;ASdvDpF?TS_bT#tl7OZJ2pf`n;lrH%cB1AfjEMQS|gf9!ak-F_c$@w*$)e1wGs z6^L+Q!2Ei%DNA5~r-4;98F|<`MgiZX@QRuu zd|(>ydaXT0X1%Ievl}6YUa?}_{w$lOfkBhRvQkL;iT073RD82DEJMth@oEWlFOg;( z9f6xBaZNGdklu_fxj4O}^dcWtc8w}kOR`GOg^yW>2uwoG?<4RGY#*JJkp>sPN}7AF zRH8vMdR8R#dHxlww|H-REl)O&UEA25+Az0!&mf*+yRy>0pHQ}iOBNc~TVzphw6?YT z!O#iTEq54NMfwmU9e8!}W@9i>6BSpE7smV+ofjK;WxULW`&T^OTj32xQ~o32pMjt) zjft7QvyRt|;J*cX3X5Chlqy6_Nq^k71E_dnk9pq3F``wd~elcuyZ zaU&RI-()xs?BuZ1Ev2?&Eu31 zz`{Cfxe7xOs=DEyicYDy8-0q+VqNAyphk``kplKvXdP&wd*@%IqI5;|7x%ZlrpEv3 zkTC%;EiY^N^G^x!$!l+KpLc6GSU9rvy0E&gbeTrygzBD;&PKt(mgSUfzar*_xFfoyyJ6*)_m8=RB@lvroIZWske_${#iW3 zc)p ^QXBQRGf@HVn!)W~Wodzmf03MHETC;I(^%1?Uot*;S*Z@(HsMK8Z0UYzbu z631rME4Id&tnO@{DOzSj_f0zk6yE*-$Ax)?goO@$5Nt$vGp})QZ+8GYngZ6bF>Wz) z_sl;R{6qfyOQe{J8)JL^PCu?)aLn=gP@wsP5}?i8*_0C=76PPraemnD%TmN-{iV}( zLi&TO$x6mEUdF};L-P*qhjqm}nYss8`=$rAleK^?Dh8rVX&BdwxadMg2YB2^*+z)S zALfaLEER?yf5Z2GRojABv!%iBj5q3R$HFK_J0Q=Q#JZ}V-U)oTBzZraBJi>e2Mfyz z6)Ei&8!$74AJBVMVtoe%$0W8oN6kJlX=F0ZxhDh#ZDV|)JZ(k>s370UW(qUZSWHqr z3J)>3zz7Noy0qF^wmZhW@aoE_gMYHK!l5YaRf&`Pix2O6lx8VWrO$E-%7YD)wi?&s z<`SY#OP^tjKKW{OOutGW%#B<8*uUQ*M}Gc9;bt6ShJmOrKgPd%NZp=YoZwWBL7{Yu zKIh$+dC%2nuYBt$9r>;2G+d9LqlzO>ywVQai(@)Ra=9uW=tP}T?>b82Y+%<|u;&i^h$manej(7Zcer#Zhh|6hZZ}@bw{6FjsU*%CtPEJIxS)dUbpEJ{6 
z5)KQ0BN+84%6CEh{)sbiC;eC@juCS9ak^##*BGyRTO&h~92AffkHSNi^Km(LR|-<3 zry-!-R(C`wid9}OX_b1F2Mi9t3|kqKeIV^tvykxLqK}~$dWB8#Dt#;=pB06Y0TA+c zQ5+gxhjUhn)p-v1LJ$U^+iS%#Edo-9;GR2TP;O9gxYcq8W|QZgC7MV6U z%syiz1Z}~3QL{=)%4o4WMW+xJFS_O*CvhkY%a#cLro2roGVidq41rLgtfv3>PUp%> zKC8IbJDzBf*9irazS#n&ewdP7=Uc(JLl=Hk#FPz z;5CE46h||XGGa5s(rq^COGF!0B(S}8J#8oXXmr|4I_-@MjP_XiW_r2cBv-@=uj8^j z-y3^>u1@S_TUco5YF}is-RYu!x}Yn)X_fJDzCC8h-Ip=ifEbSCfz?ve#RTc*I)gKC zIid2Btd*i)xThn(#R%?efEi`rLpwd4sDnI=epcWLUZab{{*nR`7MsQI+$~SjW$@F6 zXR6$UI8E4||HGyEw{TJgoC2Re)7H$vnB2LhOLmLjB?59p7#1bH61t!G;PVgPqb#{- z?AobaGTq{EvgmQ)c%859H%J|PY&|X<=VdA&Xh$z#;58_=-Pb*+FqxP9mYc1a$tr1Z zV<>sN#p9z!x-j7tK1IaYQ{5E(AY7#G4QR)b+pwNwZ@Nk)Up}j6Kp7lKUbKTeVsr%#q zAbG8(D#k=Yf@p(qNDRiFJU8u^Y;-@NWcCyIqDXY$6JPXrNRcar-r)o7yYK4vX=R`Z;=VW#SDZA#icWx}aT!m% zGUgqBSFOkf9O&987~XovsG_cWIx%Z9;lwY-t{Wdxd7a-EOfg#bdW_$NY&KHyRTzTb z70E+WQ=`7?aXW%j@4P3KptuuZR?Oym$7?<1mp9MHoniB}37}e?jXo#IPG)lzT*)uY z>5{FpnA*u4wrP{ z6fZE%b+m0yrr!5nIHlKA=+$}3b~s111?t;Sol$>^$)N@5C=m;eoDjBX27dmI6K z>~fPnn(%riu3eETXds>_(B}7f)L3_;PwIops&$G3Fx7*$ zIhT)a2!{pmAg7nQzJ&>)!bsMOY_lHF5Kf|M*1EHKZ$7^_S#H{0Ql2N<@}i%`X0idD z)az8dv4EkxYCa-WRVeS zZcdVZHT~dqI-yT>W0nSF=ed`s+IPzql#9xg@(F$y|ve%Ra( zSD&+JR#9P{PpP()=svYNI7!I(T}A-ELBWoz1>;*~3!k1ByH{$b&vjs{giL0amQAJ_ zl$}_}JDYX(Y$#|i$F#a%*$(yBqywBjQf%es6?j9)G|E4e?cR@0@mN}0dxrFY+_u*F z24$>ho=N5Dx_QgD2O2+(<8R}Gn2-8rJvbNP%=XPP5yat-R$&?r?vE=QAe5DWFS;Y~ zjxy~}MAxH&9CNlcO8&NgmC%m`*tXFDjLVu@6S3OElI47FqYh;2)$gN!YTy1HiaN{2 z(1cOMwx(_vLo&UOC(!NMn>_Z|xs7e{4_>LcXgUdScwF_*;z@Z8MSsQn6tT2tH;AX# zsQKi#Q5Q#f=T3@Y%uSIo_U{h~kxu&m{7D6Rb1Hq+a}Sf?L{*;y>Qi!}7F zY2)!y_^edN!2|?WdFx>1)$PqI^qR38HNA?D!-^edRfUs-Ia;Q)4`Q-BFMbxNiCw(@0NU=lpc^b?-TT_q&ZSPp(Ichq;_&v`HYPt~Upf*N= z+j7GJV(b79)wY_w^^~WyK^qSW>$qvM4Ox+wckc?viKi7j=5%d5w2;nKdwg~Y8s=;Z z=p3GB@yEEH0-Yaun6a@OYO1#%9@l|}PgcWp;G!=Tjze;N-ZWIrq+LK=wKqXOK3`}; zRyJ0mQQbNd_KM_O-&S;Y>qG?-;;mOA2@1iUm!vM=CP!%eG6874^VAIsA!dzkR|SEwtQ@L-q^pBy3@+Gwd8>6}@SCDYeoY2m8-< zkKrz1HQ4|fPMX0&jcw~rbmT|=A|mB;f;%^mC8+ph#gZALw@>?T$3#Wvr4qwlkL(v$rv>LoW{y2C74gB}%? 
z&++}Yxv!qKRlbU9Gt^Q1`0y&l=AA^?hX{qqqa4c(#LOBj?BG~ynpE4N14g+lx0tS9 zX)KJPL~MR<^THi{;od0uIr5$%c%8lT8phZHv`x=_`YlXL>py3B(*aS}^~ykoQt?yy zwbx~vY{lV6XFs1DdZbWk(x?@ab3+@nJZRnDn{fprf~Hz4%BS zl159&I;TJTy#e#VgX?{B|INIo{l0ET-Z7syq%s{4b7?wu+7Z}vlFg0+Xg(ih36zRQ zy=HNF-G?iJl|WQpjg2s!PdkK!n@MgRQ)~)SF;#=}*9FD%qif+g@d1O1d_%Y>o~;s1i1U(J~Cd8=zz& zGshAbzg}=tO*`$|taWDeGOnHAe)BKolx-J}toaiVB=bv2OJ`!iE`F=VqQ{1DoVc-9 zxXV6+QuK7*s}KoQ;8b}%^{gBJYWQF39(Y4D&vU-4TyL$@2n!AkPXq`VT&}-NoT!o_ z+6{2yCtJ**3H(hK6?L%K+K?26?*{fBZ;ewp50C!z5g zPIP@!>E`m8!%P)$z=J3eTmL!k4mgmoJ?g4t%JN*<_h!{QMeR=&jg-ZgG3?mS;B@LC zj!}B^E*a!D8H^XRF+dI0&a)D^-FS+1YmGY`jYn=Vi2C(s5`Bp+M3=j*gw)kZiMS2^+_;VQizP2~Lbl5%$d3r5YwhF8 zOpY{J^_u9O^|d0^A551FzsZ&)dFPW8kG8il-2Ge0etTUUlj;)C{BjXMf+Z2pI%&6h zC65cWSX=xPp@ZZpvfuZsOiF;F(u!Hf8=A3um4h!D@5WT+k6T`TR z+(|LErQzXu829YG1OS;=J^v%q35}jWXu!}hN$VSh_oz*aJxi;rtF65fX@kxT=I1ER z*QNmrS~)>A%9_R{kiCMW98=;{oEckFl_ z=0-7T=kk-jxNY4o`3!|zOsyO}rBhc~Hrh=_>TPTw%x-X6JJm#y%X^$xpxFL%91&xLQep;peh;q>?a zj-r#rAo7QWJDvO*-O()MssUH@ zE<41t>7t7M5do_o5NU+HzdPQ*cYVj>S~oup^l>Up!D#Yyp3}U$rl0CxC;g=U*4S?a zW-~-R)@1{wOzwgI5=e~Fc5UoQNffc5!KR#{Ri4tApPM;qW-WqBvPIl`U@v0?D^ z@>(g^zh&hY6kI$RpZ9FZzG3C;xv4*av_X8>@NDj(K#jy2j^IjRz-@T8v3ol(A7;Bd zV}5>h1u`VV)aySa_3BiYm-8(_NZ7UcSz6a!EiW-i=)cUSPR4WwrXxK8tEY|ajI2PV z`abz<_kV7XTioCG;&EMw&yj*dER!@PsPn&XsW{VHAf>vhpTJAMd4^BDcf&e2h_^`(@F)ycT-55GbU2Vvuo{&+UV z?`kFHr)pki#X>nvUIkpBK-Lc-4}d?i|U?ryHGGhKC+FuT1uk*#<* zqE?mp%qz|`dW_et9$3!)#5NsB_uMu4r{#$a_yhzYA#h1)X~W^J_r#(V)xqZpzXOj< zIZY0kXcIHEjUnhkID75$J{*#gk~L1S<)jmsq~%oU<+q2;|8!h*gosyFm{Asw+9my( zNv#PlnnB^??4^BXoRIMK1r9OiyoXChzn@~34est48`(c!ZS@I4ztZ6lO;;wdJ99C( z(9WR`;HYV!OU@}A6Boa~e{jGHL0a9bS>V!5r(`HnD=dZ!&qXWU+nx7Orw;o3xn+Eu z%-6SU4+k6DFCjrD^{O`QNaVb5@nbB;*_X!!Vz@pUL&C{~h~@7R8U5mIAL|e2y+{-^;qgxA9P#|TxE_V1{Bpd% zpHi(vo4Z>n?kJq%`Qro*z2K<<-%mT!d_4y_(#M_WcK&v2y&(nnAB4iwg!tqyK4WbJ z>U~gb*_qxA4rj>avsMo@xnp%}p}z$!N~r}rULBS6m8&}O>GoVC$#Az(_HU2YYrPdg zEB4#Ml17heqw?A+bCqtxlkd&1Zw2*q|MRS66aRk60k4@_FAO0U%c^UWj{5F2E%gdB zY;QilO8u;C*CNCBpu#_dC9uWwaA{Q7Vq;TI_@9ySbbyJMmWJ=ovrup`5#}mg)FwUe zfFe!D_*Div5qs(jc+{+|t!+z#+w(6B5g5;!ZyAd9nx9=v2J8o7nC`>E(SLn;iZ6QA zz{PFS&76q{78_oJWGklOys5TEihZU-2_MKP+x2=}*KRrxg&`Fd|MGlR5#Sk$L~UA!MN0Ed2ZE+-IHW zyW2~HB;uB0-5cmZS4!nd!@TD-x$a!LvOmhoQee5+R3XL3F;5~OGYe|GfklvH{!{G+ zFc8&FEDL~)W8{ZI{!?FNV+cj(bPru#&wW=$9WU06|8;}Wm;5ZuEz-@6`>5im$$t@#OsNO{2b65EoF<)He%Gxc? zn9I8CZZi`wznw|;!Aq(hxQ;0uyu*zbV&x&2(@j@5@9#dco@~Hw75A<2zLAS{b&*ZM zW`3I+{_NjrjP6~9%nzoJ;F@^HAG$YIj=NLyXtz9x&po&q6p2xMd?01NRN!3%ThKN_ z9w_HIdC?hPv0i6i%6PV@qY1~mn#6~{Y0b6&{Jz(IKkS0q~UHg+x$Uc({yT!%0abVU!bIQd``NUj%hlO>Zy-? z858~N{YR3=OsWk$$uDf6@eDT8@XFB94xyITr%7lQDJdc$KyQ7;74M0GWx(SK<9+0i zv_&ZsWu~B%JaXTLv}~@>tsWKV17#>HEB@I|L$G%ipUKe#(YS8KpDZc%Uo0pp_Lo+d zComncvzwb>Ujwpd@%mzT|Ky`D%`F>1ThYgZAf0_MOx{#V!7` zarb=Ds$`(vpa@EUQvf3So%W6Z#~jM+ow>LvwTN`@e1mzBD>8NRknsDDL;-<8G>5vm zn56H#8wEeyapMwkq{bZmxycaUv{ZTUPraq*LWr%FL}UR+5ni#O&E`WQzGeB)Rd8#b zFq+yeH1IVp68IdU#(s33-7K-61oQ| zx5tW#inb0g{XaKX-)-$3qaCKJ8t-oN8JZ6hd7WOrqhyJ{dphRyw!R@oO-o$ZKt6PP?TW-j_l@B%71M`@1kd*i!4HJhsG7wv!wV8 z3KF*ukR6)}4twx$DI*~X>|N1P7-il!?gY=?{i=gs;Nju1vy&PMagY zqUnX+k{no0517mVl86buD0Y890VagpprAcpyUZT%qD$+)WH;#RaG;^PCJ`oj#H3!z zl$sU;t7itob_256N&fjjOnwW}#zlcjKJVU*D*AvNsYlru7IZS6vjbdIgD zB8eCVg~6XE&;r#PtSwbUYr_8E!2>|5aF_@E4?7U?-B;izO%(CqU1L`duP`459wEv| za?yY}kvhY*-lPJx63S?X(NUe)d^kwI2g4(w{c2Cd+zpCmH3Q{(oS^aGz3u68*l`G( zJQ;*7KD{!V>3}fwon|KU$DBI~of`ap5vW29zKo2_%77j*z~`RA>JU(oAD>>6U%|U? 
zb-_lDT$U`WRD!1^`Lk^M_4_Hj$fTT$wJ-LkEZ)DVu|*H@r1FA{2XiQAV?I+Ye74mJ z-0*%->nHG&I5sWx$#Qt~q3vT|i-GCG#rd!DYG1sn#HGe0#w-Qs-sptjnNZ2?EwwqA zp}zcQLp;Bpu2=o9K8t`}_kTb^|JO~~KdlA0+y6`CJ3Xa-LI?I4Ep3qxGe_7})LEa8 z7_fE~YW@4FdLu67cHj+vT}@3(`!es#mwrWBaS7u3nUJ=&b`bVLI5EvwX^k*Rfj_b5 zL7?niK#Xi`pqryV+NWViTVuEWz|yisxhjQ6P9E%Ydr4tn@YMCL@e-I!Dt7;E|ITHi zuTZx3+Wy}(_ZyeDt8w%+!sI^eI^pzTgoZblCrs}Oz90Z^J#i{3y^(J!sN*H`Vpsh) zZz^2P>E!ba52bKP@xphs#*OA?n88}5UhBKUBKebFz3X*b2(VTVv?mT5gJf%-@D*wy z>|r(kho{-3Cw5E(9cnwAVfMc=-dRBTpD5tK0KQ;QuIy-vf}4A>PW@-i%8yUgif7hm zZ8DuKmKJws>xsF|?w#&T1}-ezty%p*l)MF#5EpyU5R(hoKTNB4>;&vyxUGv{T3Vx1 z$zemS2cIAQoO13f+}Ei*xRd23_&C8tf#8NB6l9$qZJHK_}9Ni#4(W)uQhgA}_YLb+<8T5K1p5rJ&HW0Pu0EZkbCr z2Z$bw-*s|HN45d{6+uJ3YhYvrpFDu{WDhEXH1;p(_kRyM3ghWmiU0ec5$@F|^z6}e zzBxIxG_8{%JLZ}03xHS0f}Y4)Mgs;lCD-+;>7nz}lTzh(jOpAgD8aGP8D`>_QaRwN@+YdJsk5?fAJVWp?1 zgU37w^`^W!I6Ld!nXWE#Juoa(FMH4#N+c6ss8OVp@bB-Xklh_FB#!uRD+%c#5z{9+ zS;zj8!%f8F({A>r@V~r!2+@NOuxUQLID(Jw1skny#t@I9m#H!bQ_Vnq~n$_LNdGE9cpb*Yh8K-2@ zD%OtW*D3QPS?j+*j(esi?vDd`nb)h~c#J^L-9CaI z;y+nG8yROJN;5>AqR~90x9*P1Io?!Gk=feW4PDqERvdoLlE`AK2k(a!PI3`@;*CFF z!3<^fu*E`W851{qUSY?_EkRNv%}%ar6S@bq6`BMPBs{q4v?!I zp{sY)JUp=<-UEy~1ZkMF#7i zILY(H;m;X4a+Mzee9>t%kClqwY;yDg=!nGP6+D*vs=*Ab^hH3P5MY7p^@!3Q0(95t zWGH8=PTgNfO6XGUT_N0m@Q}La=6?2}Reu?Le59T4zM4<;lg})Jhnbv$0^e)?4dPc1 zP&BDudwOv>bb+L!N+j$&dTQb(0oohI`4{a zJ`#r*7DGTOl6x7nyWs8ZeGVV^TH^&L)U3Sa+7!@jv=8h+T{v$0W6~pcC%245=M7xy zhGQXqr$pSAmb-_0ic|ej!Gm%;p$AJ@2v_htS0ibR%F_ zWtPWh=~+gAZjb!@{cSHtWWy;$@MwduKBwVsKq|~lC0+E4`v|XQPV;>UPYpZe^?T1l z%w}>0UP?%C+MhqhB;mgYCZsqwyJt&(ks8_c!8-S=F*ZypL1Hy_y$&*)#t54x3)Z1C ztHJtO)4e7p<(~xDyYoR={wNE*8!A7wle2{%fy#dPs9kw3h0jL6ZGsC?JQUM~tw|I2 zxC{)MkX?p+gE&UuQeQZ&uKplq+&ybTyk6gGpEW118?-w zKBz$B`S}~;Gtw!}6hpZh7#e4y48_*g>l0tek}J>e^V!TiX1Uo-6Y5}ziuaHT!Ad(S zW-6g3*tvgu3;espU6G{LRdnJ*OPxa%Zt})`$va;?_n1dYv};-0Zq|orm)_2=UwvRw zEhK)MBF$dbcVn0)$WJ#aU{!*Ki|aCq{8l;rKKMvFp7r-N2D5_Kf>%W7z0;L490crd zkIN3ch-dajzQrVRIdRX}%!zIMT$S3dudD0%SueM==+^6|2A)U>v8M)j86P$eWQ4&I z*xNek*_!wwB;&r1-g3;l)b-k^H)Dqpv44VA6M<8ZA{2G%X}vxlgMLJjXOEhACdRYr!<;t_q;-@pRK&#GemtxLiCx<`LrZ7zJu z>(g9vqMeFiRI*k$x%0=>fcK^Ov!(D3zK#fu)9p^s^YV8rE)Jo2CerBjaydgR%7`vR5>w{nbZ6H8 z)XJwNILrSFn+J9U5F&e_VoHx$PA;dJVzF%l>>*A%ECU#l4bKj%_zprt2_5w$HczpR?btKI57o{be0}#%PsPFE zD^7ns`E-gCs`k2qYOsTbW#8RK?>3nX&(PSRcs~zHTX!j#YSDqj^eK?x5?-EdL{$Dz zv*5W@Lm2a;gELEyUSwPEB#)X+R&aqaDQt;e-rzcy*|q*dk?S(eUG899Q=oZ`bR-G% z0Wt8GYv{fIPc{NRO`#_;?GoGO)&wgk&fw)8hs0n@uPR z(W>)Y1zp)#;W4_y!-gy+*t(!+qq8={)F^j<@)?JM?m?YLi759IV1*k>Uby+!~p#&_Ew@5+B z1CIf}^Bbh3N3(f2;|3!IVFsn8iAZBph4)iTEq0w)(DdL%vxrE|^^98V|2EajvA)4k zhR>p$7c3^IUw3DqJ#zQlkKPZqovB@Zntj)dA{2t5Ias1okD@fdn4v{vAc#W_;_q%y zuKS*!7nVMk9Pncv{s2t88L~Gc9(xiz*Vkm0vsGB)N^X7IGvvaK#5X8doki0w;GS?S zF5(iO9T{U13y;(xHc8pn6H|F^&HY_|^J@rT@(&}kdchzD&UbmfOUuchIrE=d3p8&O z$ORl9>j+l0Ck;F1_%ufr?-aC{lM!u*9xMfxGi zL&LQ~J-gdBWQ%7Q1cR1|7aArY8!gJ~w4uZc71oOzRHLbI4Q}FvXod*Mq$Z~=Uf}-g8*grPt;P?0UkHv;y)cqO% zp{A$30jx;AIzrVs3P0A)uSy_`@GX`TybO(>D@sXJx;b4b=N=m`3pJ|@`G^WT+`AI&#jof_@U-6*Si z-_De~aGU}o_h~mJe~=$9I8O;6^R211N4cIA-qVMfZy>h&Us9*LyHQPhEBo%eqDEl! 
z4!vuAT2VAo_Sb?If@6i(s1~Z%fxEMUA%Q>xzXqh_`LP+RvsGTG*c75P7xpO`%@80* z^9AC<9yf&DqWuFT17mEoKTW%_MG55zFuw(YNQYpL+a-D*{r%D?Zl@MiMQpRu8KGV- zQPHL`E0m-z5Q_(7Z&stl4@kr1hD{oiND&T)>2F{4LBvr zZ63&TlYT!(^Wh%B(}Rs0sj@UKf3hmFMVY7R+)|HX*;F2Z7^c|6y1G37#zqt2xR+5w zr;OLaQUMED?Li=5!8w*$2M;LH+J3IGESi&ke6$`IcH1IK`)1r`A^_xbWCF#XA^QID z@k`*S;j}m{;YKTU-*HDw-aue3etB$d$VSQU$}gRS-Qw%t>+>TuEL#Zpk_+CKe-7SL zwG!0ttHW}IrzO8i8IzK<1Is{iWSn<8T9B@jG=>lURSn2?f%Km}{uFY14a_#z#vLb$ z>&gU55%cO#wf z)QWht>Fw%xU-^>|is>r-qdY7oa7M_VTstl=bdOP+0|HW#Dx2xh6u#mdz$tVwvItvyIx+9b&vnR^UasA$(=0&r4A=uab^4Po|t6hTiX0SAH zFdo-ra>VcpBzdd*HeauKIIfRinGU2S)XuQ7NUdE5hLzeaf57eXM?G%+W#E6{wQTdI zK>xl4dqrpvO)YL2X!|6CsI=sb6fk($1}X-Jr}`ZlEC77`|#mvV3*VgtK6#*dd%zhOB4!Fa-Y{z)hs1Uh^iD}1_p2k(dQH&U~|rBedC z?B})@I)c9)el6TCErQNUb#mb6TWX+NC2YPC#8)8oVL5>SYxmo#ZUY-rMkH)6pUtN} zLIQ$@yM-ic)sF?-!iydgRS27$_Xv1z8{KdEa17 zT~FLM$THw|I)rv&n0%RB3(ThzMc`BqKKc2vFB(ZliiI&mAjYHyouf1+qKg1RsarRR zv&y=Yxz34$7$78)-JYvOvhp^IY`)cg1Zh5BmCF!PQ@rLj;+9Kz7DDGN3lD;Y2XeoKXj6{yOhrLmeR7_B^yhVjj5KYqXHuEqAjn zEq2+PWNBUONnr60C;1arE#`l=6&5BiJ7e`Gb=ZAi)!D2Q9PEb-&_DKUeh=QTmMU?U zmv22he&4kP%8wO6Kg(mU^~bfRLcXd`IT#N_g8Tv)wJ(41z7b z@R?m3GCqkNcdWqzS&Sq&1s*BQ?pErZwP`|!%8YC>-W4uY6zerOUfk!nSC^(+YySjWdLIrm#RZ2> zWaoQm@-wvvqrf>qkBf@9o9D(&% zRXV^}5Ob^4c)7Pc5!JpvQp^Tu>_E<828*k>8pK?G@VLoL!q4CCw3h^vLiiqVt;H@F zPzbt6oasWI%Jip>#mc?5EIwAr8+dsXUxfRZ_|sw@w$J9M&RX$JGZ}ZNmS$6}vqf39 z)L^p>Xnv)gl1duyTAoS<$OH>{s=E?$ z)G}=(j&OfEJA>M)dhREICLC|5A<)?QBtH1Yb1YQc^XG!jy?UkF2w(({&~pGxm;nE_ z27cbsj**R_`V!VkvTKM6!O@c5NP)n?%u$5?WDh<<>}NK!}R+4`&_iT6>b6>Dy1tF^p~9(^{Js zCK8LV1+$3cs+Rk-BP5=EFh@e-Z>iaQT2+O3OtDJ|4pVoRrnL{Vj0iN?X^GSv&k}(V zRCBFcCHA?xR=RL)=M*R+>`68mUE6R0rhTO}L4OPB>$y5FGO(|EX6nApm!r2nPK7o* zoKem=x^y2tVadrZ!+)gaY{?XEJ_+9|dC$Z+-rgGAyC#hJDS^?&dZi}`iG(}1n(~6P z!SrNnG)~`vh+Vs5FN_d2<4|lqFp3Nm2fnXCn2_9RXuLRS3`eIxKB+AgtXnV)JO7bw zlW@@m!Yb?Jo`c~zhp$Mb&zZf-qPBb(_hSk7OXmoFphunKMy?`=^`S$e2*-a+PHreN zWmr29O&^wPq8qN);)?=?4SEByGkbs$9$$SUnwho9{?IpdUTlYNE{{*j%Xc|s+ar~CqeNx)0eQzS(s4RF&9Z2VpYeOf~Jd z|Ks4ReCgH?B1bUwd`tzkzPWzjp+Ne^VcP$F+pGnfT8-GkLQf#e_mh;#SCKKij zCiaK$9E&xErGF(p^~cl(N)i>;9SlNtae&8m2M;axUJNH*D_r!Qlw+UIt~9vlYL&-Z z8Hf65E&vE{3F2u(CE%%^$G|;kRlVCdSRn;_$UI|J>eZJ@kKw&Nhz{g%xQglg2A3jM zr5@vowR4GcMsjU06GBxGiSZ$S=)Lcn8qeKk)C;w5p{%sS-{f69FHWjy-o={wV`)q| zZtS~{ezcX(xF3W~*^-B~RqC=w8O!=!^C8o4!>==nqk$+9md(*`fe*mRY^2+c82OKj zZ~CWy5-6n4)Ji($$;Obr|IvUA%+@nC8`xL5rv(;~Mz#Vnj569>+58c>Us5>~#pUG> zYcYtva2cVz^J*LKisSZ}QDS`#eHPxCo;=m52slES9m6c-{hdeEnZk-I)?pYH+n$RM@w(tpX zzP}E_7y7-9$rmm*`p1hbWWrWwkIBrk*2Fr2mjv=SHF6H@kt;Hl-%DVX<-#CN8kim|D{QBkR0gVGZ`Y*IH*xOTO?Pd*)cwb(q?Ft`yy^T0KTs#gm)p^Dqf{TrV+59?P zn50Onx+_3-_)D|c{l+rjg~$>rhC=!Bl+sw2(v+*Tt_M}ecHhZgh56q%C%kF8XE9!g zw|FvT5w}0eHy_dhpXYcGdWa{5(K?EzO6F*=>>S!rqffe9tr2~P;n{!A4rRlcCf}9O z;Ugo<^LQ7;P|O2V-Z={g*TTh~Q2BO?T;j}i27<#Eb8h3YEX=nWb+3de)RoHOp0AbJ zwK_KFv3CD{4!7N^3PK!1uY{3Mzd4^z^wQ72qx4u9$)d^f{PusatT6q&!#n>SWRb=VSBIMctMbEneTdPo z%=;77rsGH|k26r5Qou0^NY{s#R~Nt6waUBZlQ|Bo%kDvmMRXIqJgh@`I@e&IBGz(%izL}}=k=S{U2&+@5V>Q#_yapEPi^z`l?j>F z<&0d!w{H!T39;FQ{z!!ijiC(F8*gQTcG)oKWjvRn`^kD}97}u_03C@Tc8?C%`VyD| zg$&4&=;c)byorn@dil=qp3}~Vc;UgZXs|)|`@d(f1D1`5F6WoS248;=iua^HM$d{?8B%W z?&*_fsyq=hd;@(&;k5Hok5&PE&I3f)L`~n!U%V=e_YZg|Bo6Yh{etonoWio1h787I zei#VLaTZKB911Ur>>uxcS=I1rXx!g`>n){CIpH5Woail{?Z(#&eR!SX=Z|_O&pFRN z>)tcc3LlzRURraj>H7ipGa}69f>v?yi7G3W4~iKyY4O}9uq=%*)t}x4c+EA=C*NoC zfkpkd*)`a&GA(Nlxm)lu0=ZC8AFU096(&8`1lW0YhZ3>a(Zb>~8jqC`f}Bey3NLN4 z?)%uA^rF3DS$60szlrHx%`X;?ceRe2dh&NIKbsSh_tc(wj@K~j{&**w7xL}agKIq% zs8_q+R!ok4Z_hi%k#^dh|Vg2rC^Oz|UB_QzxHCjRc=~BcbNJ8Z!K7z$;lhlp>1LDB(j1 
ztWwWMLQ?T8di}+)ndW9a;f_il2mib`pxpil?T`B^D;<1goZW;R)hTVr!DrztOIxLj zPPMB!0Hsh7JytO4|7*7&5&Hn>%*M?^0Xg-R|BhsGQ&XkSb;Wy!WLZ3lLX+VB{&(S_ zaPGN`&pwKZwn+;iNH^Z>;k)nBzoWqv!7+bTn@w%`^Rpa`Rm14lpk+14LwXg~sLa#+ z%&ucR&bbd#kqvt_>i?YIdLrVvZv-U=H{Ol_Em*_Ztlc^UE$?hSQ}BIsg9Ycl7xH=G zWh9r|le;v3x}&K|&1?wiIr}MQ##91zgJb+a254Z3IlxC8wXG$IMHlWyQNU22o6>+F zDt6aP=KCzzxwp9F0$z9`J#ZNzWVChQrp!eyG>!#I)4B%JcUfp`Mv>t*Ji+`VlRLJ_ zKdjg-%pN|^w}>zH>PV5=XVN&FKJLkmTw-^_(e;c(XGPoD$zT$VHu#f3hrKQ%-v->| zZEV-gMjxH{tzj0n0pOncJS=gzmRQx0@vF^8HI>_&5~HjfSlcFTSzk9Dk!Cn{9Q3_I z2p0cF7TYD%{q%T-q7t1tD`mby;c>6N&)K{1^#$!#+&OrJz8bm)j2Wmg7Fi z<@U(dBUgGfm(A8#y96s=2+)RpV2*}$ca*Qx@O@U*egl~TBQ14oJ(^ji5#@NLWTu&SeSy` zb0${I1|EOX2hxABhsUxAZ)1qhcLmT(tXY%{d@R(7RWUxeP{d*cee{@CY<*cbf%k|E zgx%=FznHk3Xfsu9Cio^-VZ;0LOzYGWv{S1Ji=>uPdA?sO42zwMT46))MfPp zsOJKYYHFu(Mt&RzCGkBa?5{%2HxialHzksY7n$vkS#_e&hP7Ss!tCmE>@@1LUJTxG znKGT$R^Aj@Z1BXzAl+E3|*LX#34TfaYlOe3`R)v)6#xIFqWie9QJfIzc+h742 zb=ce4%Nealg2E}P{_wu{6k;*V%{eRjDpX4M55Ftd(Q}=JYSgh_lLnK_w z0fU~c-Wl>u0zUO)`}%W#=cHh_3bHtx>hFnO_!ka89i0(>&XGBN6iV5RjX)i|8RL4M z7gpQCy<}S)osi8DX`eo$dc3ZC&}i`NV^PrrPaS$j|9X#eu96R8apJZvPF?cS$mhH- zf_y5W&Lkv@xAj>+jS==yj;8JSwar0kA{{nO$4_miUG0-iM`iOn;tv&vlI+BbY4E0& za9c@6+?S^gQy1q#qU-BvHS&zqMcTu{q}yDwa92Zzw1bNh7qb{u zF<#94BYzI9*a!DWsTNRLY5LcmoJRNV6PgXECh4CrE81EW7FqYbG}N*UAtqM!d?zk_ zd3wOSV_zNP)~6mxJw;gOmYU*$MEfW+bP8pf3IBlzj{YINhEe#iFSVYD3bN z1tn}neck;v6!@=1k6b<%RN4y0ps~NtVX@AvIF(ag`J9aNzM(_CM*1^Wq^OYJ$9hW! z3wG+OO7-j8KU7+lJq0F$Cb!H37F;#m4kVQuc8>%XNrB_fntN3 zt^90@>Il>S-tkqDQWXl$BkQ`N8B;hu06=LuqhBm>d&@)?U1cBqIV_t;-J;~Ek zg79nHGEB2L03HJR%8K1w=!;%P7E_gV#VJJwH=E6o?lc90+hJU4g5q*ZlT5haKqKR4 zL3XU=zfU)=gK2U3VnT~jHj;`rNvRoagC4#@eEcRadE<`p+^+*$o5TOWwh7uqZdwnR zM~;FzafPkmX&b)QUd0iGq)w6S z_w(>yoLM>P(3q%;VU(nv@t zoq}|OG>CvAF?5%ph;&MqfP~LJ-0$v-kR}^eL*SW zkKV6E9vFvOHu0kAzsa93CExxl&&Mt%=LSJH#6G`P>I0?_cC3AS8Js4W3G@I%WwPBK zaCUp24ufsl$r!z?Qxs^%hHCn5u)vM5OvJ*>&T$|jEA@9-{`6yv_V(9Hwh~LKTYY-g z3L)M|A(S6okoy){H-aD={#W@O37`u~Z5Fpb(?Ziu^J!#}eV{u)vTg+ND=F1#EJR9e z%8q^XDrdj8LlRQ|ruOVVwxfG((4&kAwn-9)__p6v^bB`S7)wD~*QXW;&|MF>f9gYU z)7K!Q_zJxUlb{Y<^kLIHltm%IT<6SF4Y@?XZkf4^4*D$Rul?|z&#SDzv4cV7tL^uhWGQYYZDDNqKs6Yl6>$vB^p zZ3(ccZMX|Xdxt)V`us;JB^y2ep>r3Sx&QxuH3*S@j|>>zGf+tytIg>`=4pKw8R@!t zw@Zk4m()B9Jp(VjjQ>9agR~RcLmAFfK|*e4xujrE(7P4~h`N&LLR{b%2#59Weg32l zu(|w5Ff|A-c6|S=qP>%Q=nFhfwj73je{|VYWRE(sc>nX6%A^*K;LaE#+GaEAi@D8n zHK4Jt!9uRipVJG#W^p0%Qx)G)0hhBb??TB!fVld+_3enK>}`4{e=5si8Ji77qo_SZ zjuC#HfLX+L^<(_5(9tDTXo1UejoyE^J0E$^?6SP-Q2x_RFU~tS7?~ZD3^6f-HYd4k zK+^(L8@clG@_hE#qOcRtkWy76993OWq3Yo1sBb`(hP5{u@Z-z19_rDnXB-*Te*gHl z9B+?bgxs>lf32C43>Zrs>rOV)So(RsG>vD9xUYhRJMHMmO;4B2>(5U#WrCsrz~1Qx zr)V}{8gkeiIttA@0#n;Q02<2MGBQQ@Lcx5fVSLtvGU*0FWa8oBoiz8v(n19+S@UeU zL^7tMWLxt>;N5Mxs=eIX3WERVGidiT5Fcd~Ag`T_$Svk&unv>dkL-0$LLK-AP_?Nb zqI7t9eYEiJMjvnpJ_EJ(f{kB zsu!E(gszbhJFr`lZ2qkvMJU3Uv-W{Yd=v z64l7@b8r24%HG!V?~1G&QTQTIAHxKgruPpHCatbo-A;=~;Loxsvsn5}U9wjeSF~1l}bn+Ka9rzt~ zGcOmwiLm_Y>&;)sn25`K2G?QGt$yg+2Gxd0{=0HlTLJ3Mhc@uHJ%FtuX${0maJ#2P z=e8k8MNQ#8*gGdedS?os`DVQ7{*Rw%;dr9~JGH$7fWyTFX+T#{Q#(A(}4eR!@Uu zc!EzO$}Zr+QPXMnmU_HR`V!vtKTW3q%tzm5CUfCXOw^*(SJ&E+y0>@<0=kj)YF`4V z{Otr)6lv`IvvZ&Y-98%4VZZ69`tzQs6QM8&Lg;nm89-xZy&<-*fWYi|>HZv;3U7JA zFMoOJOVCAu7VA^alf7S0AtiS=0tIRyf}(9Hf|sCTG`%=#wn6_bARr5HpGrz&~4RcyKfgnik5M^WO<;7(c)2 zDcYsgP>Pzr3UxcK>JSE1^fQ;Qz=)F$Whr<#oIw)C2#RkwUnwB_m=mG&`e}GWNDk371r>PRYaWBjJfvN9S(Rwk?Y7MAk zDYb3iei|$j!g(mo@FIf)@e=3lM(9WgjRFP8NmWD6z4tgMM0gDJqYR7l{Npa8BXe;< zu_5sDZg+cLum$q~v+#>n9Vaf39i!+ZD^^`j$|e`f!3{G3T+PIcX%T#}WbQZQ-I4jd z)@s9Pv4L4o_UjgMaSLM}a zK%*Qh_kadC;SnGWBDtOU^Nd(|qsZBip8lS!T-&7<;P!4Y;!t?t3{s75008aKhDJu< 
z?Celw`cq;Ub85u(OaXnZRs7?VXpru*;r|q)0a}M9LlR9WN-GzDh{Fp!xlVuDYuXvw zp9a+xNnvE2%q}|}z(~o&>BwPs|1yD(IGqErw>csPq{G(!f?jintG?*QSDj$tiJjdK zy218lL8xo~U8HlX7kH>@8zi7?^x=B+y%ERczdFdofD`(NnTt;; z(U2A)&&Kh5bLuU1p1<9G5Bxh+?o6PjM(HKctCiUF??NRf1OzG#p%$o0&y@jrt+3Gi z``pF{4SBCXSag{7Y!+=RkJ1t#uS8ipAVO{cE~NWIp~*-3T(na?hrWx~x&KgUPf)Ct z&9F1L5Fy?p(+o~cyF9V*KN0yo)*3Hf#g5+%Ek=o5vS(<8bjM-}7>dJbbYX6PsK{Nr z#HEW(ouIV5BF06T4H+29UmtIIRJW3Ttk6P%m5PG#?wIxSh8$@?L)V^6x+lS>MG)Z) zdU=tNkCct_TW0D%_o&l{0FwaFMAB_73gp!<8%r8AeWA+tXY)1VYsipNZtkf&2uTAA(FE|q4k-Q-lMv0l)C1?s1A zNM2MgN&S^-#Iwa>x~w2mVBmUK5GgTp4J=B<;qfy>`2va@%xIBG33vHR;-APrF^9$5 z?8zuR-uL~>MJ&Y295{Cc7lumR!wW3$S$K1cV|2wF-XXlk#ZlOJG>QV)!V3|=GhF|p zevEP{y&i0cRpO>fkBa_DR|juZ`jR|L!LVHt_^(|XC-#$oC>G}))Y#;wsK9Pqb3{22 z`m>s=O3w1aqNgrj%WaEg-d*nXHwZ8%C`iZ!jM&ErJui5^t>(OhSFnXDdyCeCNP6a0( zpmjTPIqKdSS~dG^oh!wz@Uw=8R#vlTaP4orr08K76e6<8=LRt;acy}@`l71CE@4hj zeu~9-+JMeDPwG+9eidoS!b!>t;Revql{kLaj|n)FdNPx%ORR9~${fd)(8ujm*Ltn`;vO}s zrg=Z^eWCdEgY)mbmXE$4G%0cfU?2t*)9!}1&ATV62aLH;IH?ADhKI+%*1|-uO05Sgm9ct*C0CG>i%-GWEtnF z%1cx2`b?w+fJOwQO2mDN4O?g2ya&Jn#~lt4Q!J%Qy~=(}A84n)@hi`mst3z&Qjb%L zWAwbiyx)g-M0TH+(jG0c5GRe|FKh0V*4LqaX&s!N=>Z!baSZ=an3xsjhCBde4B@@g%kl)CoLh5jF*Zc-EaHp`JGz*uEqAcQ zqh*C{m#5b}06RfoS=#cY&+@r9GSGRuexUtp=C#1|F%ZK9)gEIWN-=rOqVuGsAqBbl&&v+z}%s)l0i5 zvbs6qa+ZsUIXs_-`~H0XREzpzV9;_%vLEm5(5yFrq;DpV!(oG#@G&i7^fs%&lkS)! z-(uysz{BRI1Ol<0d*5qh3nc7HUnENh-H=4{nSE)0I#DBA{J}cP_{hA_-vK^|o{ds$ z5{q#W(@gkICD@QM9lzi{+r@n~)h?l+2u=xP)?1`a@HC{>$OVsMPO|8Jo|W6-be2PH z`eMxx#vLB&)L0qKMYZ~Q?UdNNLGZ+FkBma7Ja(`-x0)wore1KytMM3B=9(#8ER75|9f{H*Ju$0cIilgJO5l%O>!BsG}nCVXX-pkP< z9xg3R`9tY1f!rk;U`ZzZNOG7`A-+A)jl0~(3X|t7Q@1hA6e38ozxo(Ql|`SC2b~+h z=;>*LQcwz+eljCUG_N_PP6h$hmIXZJoc|pp*sv9b0p&3&FC9-uoAe$RpBAebPxJ`Z zFc$Umn&7I0F}I>la#T7T8-{N$7(6QO^WQz8PIZQkG@P%alp`?d%4c=Rt|XNV3dDC z^^StCp6roYm>Fm~wtdDI?TsF125gE0cmZ0M#2&j@>=M`@Z4FF@9S*izAfQMiyLb8@ zUAd*Ou-nYS7Cvh=_D~O8U{V*G8kXvOw-^s5zPaD*FFtLv2P7Lzn zv<~ml##64vU$_?dLgFX0oSSmUs)yRNlIk-|8+OijrmSCm>v5vbLjk6``BWRh_g6J6 zy%}gJ8VOzZz-#5r zr{`4(G5||p!civ>1{IZ5lA}4nNh8v1f;aKruK@J%uPEYn^*5E{7kJAAw>q0>jDxnz zgK2}jTllaMha~jZWx=y7)1(^T|FI*lc@~j4?qSo{gj;1PtIWlqWh8)(V~n1OLr5uc zXHaOFINA>+GS*pIuE#nkK%ow}v#?Y!(uP3#vPjCq>KiK`tJ0D0n508apr5vh30GKH z*u)sISdwvNqA+bvQVyrjFP&e`R$*pZJ2M{%JsLZOk+*ss<|8$0X{u+X3X*Rd$%d_; zF*)+uX+TgH;HALs+IX9F!}ftmRbH;%@bC+C^L@i;xI?_=p$g@|R7-C@Tr&qUrSpb82d9o9u$`@9_Kk>2q=B>A|RkO zTxgasT=K2|W9IKx@fQDm;Hc1&k}#h2c`Ck&G(`t&+HT2fYsft1+&H5Fnce_Rt#3+OAxD>bl>^QJqBJvV2VRk$m8?5a4L~+MxSZ*;m6Fx*V;r zu0`W$t@X7Or+CScI5nmF1GtC^0cIM4-+(>pdqGOg!M39Tj&BS>07iWM7PZ|Ilq>)J z#09Y}A<0JoEioxcAa?K9=k1=_j!+`ZN&xg)PmDjX>f3a z1=!N)j-oAEx0fygST-#INb($)4&(JKkHNlw)Gt5T%%ReAZoYUh+mK2Yfh7}CpP133 zk;zyf55uG76-AqiJTZ_2Hyd-K2yev}lyHL5LBUoP{$R~>5rCS}xa6Lp&J}TgP)eNS zOP~HJh;l`7f4vO`(g)H?MK8->{HlU-D-LzmZuKfV&B z3C|&tQm~cQ_)$N~Gm^J?v2C(ViEHXT;unt5CNkHOQ~X z$!}qB4$zyhWxRJNTsVM06!N=mAx{Jpl&jaEzSdv9h2!y{6=qMb7g-)5$Iiczm&luF+A z^%tdj0D(9kKYDPDZ22@*p7sx z=|eh2IGxEvi4|@~x{8*`9f%CrGuvdDEIu4|mDCrm%rMksNyih&(%}X;E`wmPuI51T z{mA}@QiVu$X7H5rM%JKe37&KY0_B`{!nX~+jZ#~VO7;q48>_%O?_jq@B{en2TDG@d z41Y8g{bKn%yJPuSB=jL!Tbf`jiIl4^NYRgwq&gxe@o6>!Y49~7yq+7bbu2XYg*H4+ z!tOmErck2G#MYo&vIofP0P!#9$OS!qAL3LGVtvP>Meln)#6;esdlDOU(oGp}f^<_7 zv%zE8TMet<6+DMLQVMc`In#3xANnxx{svD7Rv@)6F|DLYoa4MZ^M}yvFMrr!-wCly zNN;DIB`(s7X&AcKzn76uS|#U}o=Bv%w<_Dy|LOIk3-DnW*SpRJ(`l3K1s#?ps4Ls{ znZfHA^gx6}90|55lO_zltR8@!sHc z6iVdiJr(FF3Au#o0Eo$cHr?nPdVm5iXCLGi>BTvNQoexubV@B}@e631qK5D@+uzT~ zm^f$?V*$!*YoV=ZEUAEp-2YvdjM!bUHj)lM#i`&jC-0AIj79Z34M1 z2~sT}+}qKjPr?tdhB8bRt4-F>Fb8IbULIFmwSvRL8qkQp^%ocvHv?W(nzlS0Ko`xB 
zlOFCMC})AsGLM6U*Q*oSJ`hQq#1}*E#Obp~R3IR&1ksQAo!ZHUpw6j*bljkL?EQEC z9T^0Dk-TsH5@SK50VTg({K!C`pA22a^mo9@xwz$J+s@-(#QIP|(}Jvv5gEU|?4Gz#ueU;9sMmTc$AnDN+}zxH1FHS@ctONK!iLiWqF9pH z?f5iSqN*iYx9plO>TJT!XYX8Sx0bS@SK$h$-B4WMrNkgKb1L|QudHNA(KptCSXZk| zN<_;c)QJAH)GX-f${HNkRm{C-tZ4_jvx-Ac;5<0*iYf;rbaEKC60@7n`LrTGdH#NO z^x&h4!<(rZExhN_Y9#k<;2ari=`HWR@?lOI&J9x9J6Z{cTDXRp=~3-iB3B=0$5!Z zqxlC~s_|F6O0JO!pZ;kJ7v{<>>9fN?`T^8;BgTNOf0pa66%v}q> zfu6T5r8{QR<-;{T6F+O0=xv$|f#pl2l#~g)B5bTmpi--XpBWn&G~&wj)0v=vPnr)^ z4b2~Y94Zo0OOg!jM}ltxhat*AJew&MYkLTKl=)Hy9%|;dO`BhTQlT>` zMafvtZ;rTTUpzhka2fL98#%RF#rXbBQw*WO)tYF`TgM+>A9(~@W3OvKm}UHWEFi=p zWTpYOvip#tXV=eyqSm9GV z#`YIoe|h<&wTT*!t!e!cxMH$*OpN9{BUqKsiT&bj+6uy6pN!o-E6V?akD%ny*769y zQdZS8Tu$@RarA6J#dPuvyOd}(y&`}qJfHGh#wIfOMc$69E&55RT-n&-_o}AWJ#AA( z)N}?D_e;h8Ie9JwPljCx=mxT5l$;5B$Z^tqP$9POXev;ZKedndV^Lz;5uK{IDr3o2 zt;NiADmUy;NcyR^xR5t3A3H_U0#z|fuIu(Qots4o4&@MwNPT3}HE0Ho2}PWsGQ#Yw zcuJaeeiznN$e`_sW;b_{UlrCjw1=gW!*q1ER+l;%@aH8g$_GPQM>f=|lO`@YY&l$C z;oD;7IPp-nG<<8a1can+V!=+bVb3EMUPjm!!>_k}H4eBSuT^D8=ai85^C5l`BhpSU zocp;PrKpcwBI9U{DzC?pW2(i)GB0QLh7~+fK~FIp)r-gm$LWFW-@{T^7Uxkt*5K5G z$<aSPwf#-i%A7uYxYm2_rSs%(~-I|zLGYl8z&L-MW#a;hr2c=EvBd)P; z)iyLFL3%A`Du@R5You0i_a`Rb@rd9>#UI9=?H;(c%yFHUrhoG~{wF_U)OyfjV8Mqs zTOuK^JBf)f&O&Eu_e~TG0?`G8_ZDd8cOx?IB*H0rRGuElu{o^(hIGb{J&Z+4_ zrTP?(`#DYp+g8AT^T+vibA*yu)<`FEv%YpeR;Wa0pL8zn9`dnu?q^>jwfC1B{RJ12 zc>%4lh`a;S*%Q&u&jb`n7T*Ihtos7r!CJQ|QY;Q=E`8}gjn*piz835sGpXR6x;B3| z#&zlqfas}3>mDum0Rmen<7We`}yY*BpEbbVq4Mu17<1q#jX zBEAm^NR_D%6JLI@8j>%ye~6N4qG&he4EQay2s2XzdgUp2%8Dp|3ty!F?v4&SCW1rLVq82IX8wMiXGCAG^qHnUvpv( zuX1FJJ3tKE<6k@D($Qzo*SRHS5M+BZzEl=L8gwk8_56B^&!3u`TPQCd+*d5{bv-~o zNSo@nXNTqRrIC?lJ(80~=zZD?CX zQi_R&KB zC3bnOwpLcjYjkS>I~2+{V8=SUD{jfTj8wTy3eiSRKV591Q9J<*us>nqP9Z&zYD~;ifQQ)cRZjDC!yC( zyZE@EnRHt;hdoF)X%(OJ0-QTQI{W|9H$BS@jk@I)aGvn9*4`zr|Ki$T8?|gS%7G-N zXZhWNfjB;cDsBGA4PZA20Th^DP*B;#BnzN-!(uUjZ6+ho*W2$he4K>yo#Fo1|4t$Z z?a4bu3jaL%jWAhQ9wh)=(jf2-b3l!Ib3nlWSYZ6ad)X!|x6aU=b+&TA`gj^t25MvieBLl!P1Q#XM zZF&wTpxg0#T6j#n^~q}p|L$%^>IU+a-`Vjy0moD*I{>*1Aj;C*oUso8c>p1a$s^37 zfgVDuE&cD>Z5%#?yroX@IO>im)q|odM?eQt219SF-&19YnKUO3$tfvic?Uc7qXUrg zb$WX0>p$CIe+0lEn+y7;K99kck;?~iAN<+*F|2$AcHPP0;pXnR0+-On*l{l3*9z*OY%#=w`sG`s&coJ|L^3k6(jx%sX@TmpTR{EQ)+wDg#GSveU3=ZFc4A(qB4}E*9My!%lPSj35zE{O$kvoD=2gXh5=F{7U>PS- zjD;2637j&l&}mXoEfct}9AGY^z+zyse+)#uLFjDZ8UPICZh*hk2V#2#7LXPwK{$5? 
zyd=png0qUP=G-1mFfofM0s9)Lm;x&3Jplo&mNS(kRefO( z8^}@uWTIq1elU@bqvHZui%JOHOF(D21mZE%mi)}-oZtl2{KM%LuzzefL;GnUfzVc_ ztT48lw_9uM(h}VQj=>Z>IZrlX0K%L?6=~s+TgM6v*?<3GrJ)N0{N!d3;_*;XRMZTG z(6r;lu+Dh-#lMinY%&rx{vDh%76Ch;P)+Bxt{sU8udVCfPA^|*g?ya_=`FKCLw`RV z$MH&UO~?f?sVOUKTcT)|LRp>ZCasAA`Iw30ro%jt*7;pN+SjLv8LcJVE0=r#4ucQC zXW%Pz0-ME1M3a_ZBs1B4QZ}RM= zWMx&(SP(-z?tg!PwI>H10fQRr&JHt}0mPK;ofd07%XDO$#rO@+;Vsw`lZ;H1y|CoT z1I;=%2NWuRn#^#!IhI3HY8Gl(9yVb^rL1oy}QyX?UMj|SHMKSmU zW4z!Elw&dZ>mh!E^Y9!&*ZBPze!9?$Y+v-IJs@L^zx>B28?N~8=K7mvuI2JO9!sn9 zgC(P+-u6gA0Jp^!R-ie-HecBO-q(73eC$cDA>E|a@exjv{RxSf#s5$CFn^Xffk~jP z|M9n4VyH|FKqqb8d+b5?1=Q&@5acvo0YEjE$G3dV$pvtT)zbpjts5hi1%Tcz0$pl# z0Q2PNj*5j6j-G`FpkOjk&+&kBx(u7d9+%p%`SlJ<9B zAF}^r4642NK=8(HE_^&2G`AI1)W}ew-=-QPgW8nJ2dm`3bX=H+;T@U%Vteqg8Wul2$UYvr0e+VQ6|RO^7yzWH6A*nN=6^f}JN&ns zg=zj&VIrtET4@I%I9061xMrH3UhVx##|He@vi;Gk+fm16y?AISUT}z~>-mq=%2#a( z(^_OdwE7rEXTBKCemarAd}%`|i?YdOu8y`vJdc=7oE?xaW5*Xo7g`<*Q*?b2u-S{N zV?9)wN5Wblk?RSTS)nan9;rYog28%&JM#LKX4Os=2EsQDWh9Tcs?@(I;c|I#lewG_ z%ZP7I*mg1uV(_2@)omf`j)HI_EPG?Wxowh51zv46@e;YWxhn8IMh;ewk==0dVC zGNc_Y%gyjRP3Eoik#{E67w|ZKj3BQ-o@$i|3ki(y@jF$fKn9a7Wc@rDf+la+%f6f% zd3yIDr}H?Lv||rSLuv%nP`|;ASU^RF499>fC`8t(9o&gP7Sn;uqk|3&+{gu(ztPlq z9o?kaRm6Ii_qnc%(#z=Lxl=0{zlm9U=&dW8$UZCZz3?gt?3#K-~tI2al8muPt6 z7hViG{xN>jyHD){A>K%v@NtThoty{PPwHZ#V93Z2p00{He%c(cZaBuLCzbEFI+;8N zOpoB-2TStD8!vZ|&`KUSlnob|D;)_3KIzE#Y%sR+vsP?$QYb4&0< zj88}}Ux{k|HN;>*_5Cu)`pt9qp|YsVsCJ&d=a0H5v7SdfFQKHOjoEAEJ89j2mXTHP~@2z>InOyqE)i^ zOf)6m34DEd`I+9IzH{TmDbg~uw*P+PBte6Y+4_VFqo@Y#3JM`+;dtn zHBXY2UOrs)M{lIDiICx*JB(O9^CLFDcm8%Bo!b{Ctd&rZe*FXg-1fmTVDpu}!pFmV z{vf}Wn4^;+mn29AVeZ{n2yrVL(18#mp7}LVQ^(4ym7BLIRgtIzG@<4{C940x@RZ&? z@9JTBFnGcs_oDf^hsl%I zGVO###N2IvYX{BUp*w`?_xnPtm#tH};b-54Iw3P8ct%y)-gn7o62o)bBnfAKLsgJl zWP)qiL4Sj*a1I{8yA}okr{7MEv0x~T{E`s28rVDCfLC<;0`}Z1`Vi2z--BYTfc;$; zu>Z}-dsJHKkR0B^-4tPu9h8mbOZU#K1}Js?JiRo`X{Nqq*)MJ%763nA2+YF~bTCVw z4o$}W0DaCUkX(RoWm7Vco8s=Fod=+uKcydb)>t5?5W;K8I;+6`ngo1@qtR%V%@ejR zsW7T^OXxPP4bn{79Phn3;VZCgr=cd4Z2@%$YhVl(Pb76U7ETnsCSN022wsAp7n-C~ zm=_<;*?am5wu~Q|uXRw`W->x?J9TOP=rx(GOkWd|DROGG|1QS5WT2hoM~h?nI1jMw zMOUf66ZI^wdV+|yE;`FqHF2|(1rv7uq{NJT>ZC%?4lY$O{0e9D61rVmz$&>Xe^Q+Uqq_X`2uSJ^QdUbk!HG#3Wl=N)KIt#`k!;3LK}uc^`ux*g|fjD6_9B1{q5+U7EBka$xHLV z+~p>>&4@_F{)IPbBFj*qLP!pTd^nfpb_8sFHeNqxQhYAO{41@A~ohm z5<2hc830!|e1Hxa4blz{Zwh}DcD^>JMf&Y=Rhr&RQAeT;P= z0of(t)qzOGBIIALQ@RZWsE0M~AS5{tS5Weml34m*-5k?HIPz8CfKLSmTA#4zn~!|q zXuvFA4i1$Gyn$EV02w)G4P5jeLt~@Yl&fV7dD~mh8gGLGRKGhxP%7nTkDTyCfMF`LfaK? 
zk%ngUo}<@{olIx*KbOF^jUOo~box$L%|@2GT>9)7~IY0p?3Y4N~2Em>xDc|U1PU+76Aakf!Eh~$fGenUT9x-R+@=e>SmL{AG=~8}C zR`2rsaqyJlc*N`D2rGcjgZgw2@sO%#=I;5b`ai7JJz_>0QekKsuCBE&XZV6$z@cEa z?yM%}0I2KSYin!2)zjs-E{8*#&egvdvw7W2?2g8UqGQ40!A9WsVNbHFAoZIwZg{xu zV>rggloOY*d#qBGgS(Hxf{NV%3yH<K|T)p|Sj_oq7Q^gMe+g zPT=_xHrM;;6?$OpcAK>?vA-x1@BPdmAn@_xNIuwtlK?EA=06x^vCtJnL;34L^_pNr zJHjYP4=eNSd~4cA?E;yo$*fnPvZEhN>ThcTieedqEZW29zt*D#rAo?kQscN@OiP^& z!4aZV0a&*EV9_!DvGZ&Nlkk6^R5pPa3a73P1Ds@02}~NPFF@%Z1b|Ch%PaYkh%=Lg ziz-_pNb}<&DYc)X9b6xMicojvg#7|3@Ut|?Ufz3i;SA487tulE-HkK)-zPRzrurs5 z(a{!gx6*t38?iiv4c7eI|cmAh+1ay-w$?F7GHi8Fi0X~nP zcBSg|x7!s0&j`m*#6B_GqB_dLgPgZ+!efDiP-xr1K+P7;6`!8MejQ<>5}3yo!xvrk zs2VjL{5%=}49l=^PyCVOSX4Ll1iZd}p7)cip=|9BVks|g(OwjbE~M;XeOP{=DHk?@N1|Y7gM>&j!ye?}rO7&TI1jqw^!SKQs@lpi_R?-uHm&Ae_ z&$|)~S1FNx4$4hN2Z1Nkoq=>Ajy{{Y`e8yfZ}QjhI_zJ#)TcW=uVVj> zkPAf@oFbOmYFy4E{kqKN5Py9Pk#drsRjxk|Mwq1{mPRXIALbN%b?+uvv6E0ee|Sbg z*w$`x%|gb)lA}V|Q&M(zuqw5kM$vgs(MhjBHx1L|ouZXVWWyk>yUIY3S#?Nc(m0(9 z-W;%lumiW0%-pV-NZbvO%+nxTA}`PdAsW6uYM?=@pDP6(e z<5j5o7kVQvru>G|**I{OJ@sN>RQ$RQ9NPB`G=8jN2q&~qu}`x+$T4GW#Sd3(;VALs zR`pRT;1^jbtL+GT=$srbT@YUOj(!R~Kesp^7??~l5vQ*0^D|V>Ue&Xa0kxFyzwtvk)XAzFg!k)_Dl(INI)xtUq5Mq;nG#xh>+HvGa;-ii zt^%!(jo&?wPK(E%%pDz~JH7w9j3}ba<3f(^b(wpQnNO|CWSC4Darlq*1`1r#d#dW? z>!jTYyOP)rOcryt(TYd1=}Sy7w|`DG^C#ElEUTj?cOhM;%t@-@q>9S_cGFQGLFajcz^C;X0z~ zOEbmI)7lw2Fne{2LM22z>vq-0<_}YogbUmsw1)zC#`iTIYZPg0$GZNS9mr+ZNQPlyoqaQK4MZ@VN24M8 z7-1{z0(#uhI*0~IErX60m%-ZU=9$kP0?|6=Y?P9qP9Ylk>2C%%MX;y(BsfnJVdLgfw&2D~ZejmP= zLRk3Zmg0`qJHqci1o(Bie&V*bePi8GZ4}m`dJB4gr` zk=%Z<>VH7H_MN6DK$>yO_Udux!ABm^=*#GA12)w{YsGhAlO7gxcNel$&x>&DH6>{W zSyh*h3^EY3THypE?e7I@pM0XEM=*&w!(quHN$81_Txw}#ZfydJKcxnjuUM$nt$={Z ztof5|@?UAf(LK?*QU&@J8D=w4VckBD&pHouYA2r+Br4$Bd(}NTr|G5HGg?1>WN}V| zTX4ftV$WDD}aEnsw0jT0cRDn#hnR3i~nu2gmI5gmR2rWi$e;wswbn zW+6U=C;ptVbOC%tXYHy2E-h5I`$dFIRH*v9_GC_kB+Ztm$ z(BG!3vX!|CPFJ}Pdt2ncR!Em#0TYkJ#nGTfQ7upHo2J&X($aWzLSbn)H@8CD(d;82 z^eNlhmyP_q0AZ&)zJyQE!L!h$4wS)cdjeF_AV>*4Jl+_MJ2&8^qM}-D`*&Sk@%8WT zx~<~(AUmV83cj8Od^&0yo4GavN`0?xM=>g7&zLKk} zYuEiDa(!cHBQ5Pt6V>Tn39>K%CN>l~m_POHTR%{X9yT{Ov(Eq%+tZK^^Ht}cK^AQj z4$Tu@nol0dVht%RYVzGaA#LChL%XR}wp91R)U~{d@55yjKdqeCQGaum6_S<$yk*b?mh_OlLl_d&aBG6xq8)X{L#Q?CZ_A0`<5b zf$i_Z+I$wt&IXpdEQ)53fdRY^$`0%*ct5{#GT@m!Cc&vzH=t3q86MV{5y3I}Z?FGO zcFfBqb(Q?yC=FVRI%yf1gzh;grK{R$-k7cu5FYxd8kY*r3FVi-BftrAS+$uios7J` z9#X*Dx4Q;JN{=`B^ABpbvv@E2KQiqngP5dzw%=ZqVl*u^%R>r)od?ojNh^lwmT!?l zTnFUwFbljW0(L9rsyp5Lpa*Le2*GK<=3op(cP=0e6+8mn>fTo@fk0TU^1COJ%6BZ6 zfT6(a=<>;=blzQZ@_v9fF6O%AYW)7F#cRI`YapIcb`@9sE;7*l2(^&#tnt%sp zXiW~e+wRTZusg|=^4#m@+b(44;8Vi8wEpcJ#*g#A4^mB_e~+1fwpOP22iSKV8QW*9 zeZV5Y1{hQod{~0XAl0H!tdX<)m3|}nNo}Lj$DE3GLPBU&G>r(bq97p4> zPhj9z2=Pv(@8i?lc+wk7yB%HF+OUCwc@4gy4>(*{AXM`FPmtTQhPyAg;z6yn z;@~PMTsUl94Zkm1%Fwz7Bh@_- zvI93@q(pomO)N7~EPNCVgh1}(`PR#U!`_@dvTxI%Q*5J&VvnpuIPj&HY`Hl&Zoq*_ zhjy*;=bo$bi?871gRCMKn|gqB=JnXbx%jytm;w%)5zISXxx`!LQu|+ySAjp~<*46# z_G<$>rJ3pJ0Z{3;27rxPnEi0yg|7kMBA1VW%3iYar(r~E1Q=av0LPDGwrDgn2+T!( z_3yrL19J-veC$Rd%7pu_rp3NWV~MRoBN4*Tm(b+1eiwaE9ytWCn}$M0so5IfW!R~-3b%?KVGM@gF<3r z3KNr)dRvw_z_~;;o4-oOC7Z07&n5TEyXX3?QT11VAhtXfXRT(a6|ZVF1Wi4)%wNpI7b6CM8B!ze3jD2pB>Db(_x<5A>M!apYqZbdprAcL2W&!B!izpv;;4Zf$-?2-7hUu|i*)kntr)M5 zea_^mm)P!}1f6fR+X-)Ex_ht{wlco|~*;wml z5{6y_E)|CEIiNSz0LL`z(MsIHKs4p#YKnfgPIn^^CO*(dhbkrAcqm)9fEI~=Q*7Vu zOC%a*7&thd241XTw@TTLaFz%gz3bll{ma~>0Apagu*NIn$vmxVxR5-F@_qd)6(N(b19uA>~jOOGk z!Y6K85+_&Md?$EYy4Iqo=cv5~`$P5-4Nn*Lu)G_7B<4q@ntr1H{~PMn9c-vCIH zGx_ZYZ8zE*La{Q7M-z+D!QsDsx5V!09Qk_8t(meJ zu^jU>-hfBN2ecxA)`qc(p7!q`vihyhd|hfI1@S7VfdYFef`GdCP0kWlGztr>PA;)& 
zb0Dspnz{V|Qyz!QS8|fN-K7VJF;x=Ooa9zTU37jsc@pSX0bs%ON4W2LUS%@I(Mp@pDgwdbD0{&RUn$yGk&iAuw8ih>V6dmV|oUoh-ZueJ=}KNwK=L`arO?`{SO#+-38wEGJJbC0DM=r9P5NyP-w5D z;8p2|V0ahWZj3=x_vivVS5rl+AeXA6QNxu|T0fUxs5<*{G~=xE2o5P4)<>iJ?LuhEeqekfn?oy%(Ntn<&}rG$0AlOkTX4wdf7W z!rM?MP>eIBY!MWpEUk7-^x>nv$I-TS-#|BfoU7lUN#pDvU_mB>)m!!JjYIy)rWSGY zd*R?WSo+gh6V%B+7#pLkG22NLX!0&G5VCZSHwfs7M%Alv2Iws9d>+k6DQ7z>F>;<_ zwLLvQv!C)dx}=pjyt-_?yZr5O6C_yRA`}B-yOW6SA${J9_$o|kKR{r|l|H#u&lgjvm zI6{iS)7tZ`ad2Ib>P0}EZkn!&n*A`7zO`1^(79vF)JS#8NwLS)%%?VfU4gEc58pm~ zQ9t_pmXOpVSz<-YoK2+VE0~>{nef8#0}*#pKk=Y+QET+Gpjou?Zalm=Ubw3u{?9+o z7AEa@(Ip}J5Kh5qU1*ES%-%Rt$YtJ_8yORIhlZPDCv9oKdS>ojpD$3#re()?4#^cm zJRKuS?BugX^7o{~73x-oR!cm4M|{CBLOHH>7i{UYgw~ZB-TL5H&@8*sFgwfhA7?9H zD%}7nHD_(~o$vpG7*p1);rAwFgK`XCeN$Hd#4pk6;kxfu*LtiHatdkTHUBP>&C0EB z=D^lJ5?2Q~-S^fJ=HY1|(Qz&&cEY1H5*=(EL5IG6mqCa7kEz|ZDVoHbokvybwOsPm zPSi)<6lBcrL|ad-e1l4GwvO*P1Wjk2SFX7&TBBjMN<|hPm~h@wcKqL&xnNrXta(2V zd%-+g)yAg$gzjAtxm=y*T6A#Y56_7)%rkqT78aVuevL$kUkW6tO)CCTvnXAv!E4d?iO zZ4^yI+=JOwLVbCk=~&M2W!#Pd>^#UHsXf-DLhIMzsd>g!E!Io&UdG&m0RDGyH?`PW zDq%?~m0JJv*;kI(a>&8Ll`D(IoV4x2Y$VCYI;|`lF!R*X$wSRMAO#P z^7=t_>ggprtdHcMlxKSk(B1prf#v(ztAXl%&nA+RR)VJzLS-Yii0J9|Tr>NR##>44 zN)d=#0kX11S|5of_IP+olx)@ zU!5+H4q-@SmzdyT|F8yKXfBWpC7}9G6U-;LwX5-UIXJ%n^v!Sd&Hv--t)rrPzc)}C z#G#~ykdW??k`9ppX_SzZkq&7@x`#$OlrE(tMMRJiq+>t{2|*g95fFs?9>1U8U3c9- zT#IpF=Hz?!e)bdDZy`gnF&r43#(699y^JP}FUjD<&O`74&BPj{jKaJs1A(`c2pF9| z7ub+IYf1vvVWbkMU+%)cHoxHU8Eb(58nbp)HXCRGR~?n#{E4WySBGQ-)Sat`d-G~^ zcEwMXje={YnaH8S8zw%+6xjKib#7%yt!PrZz4_O#y=9N>psoT-;QDVx?IXtHVFgP|RTDYr(CW7n+4nG&KjFJ*J!cTQ>z|cpGxHjet1B9t|fM14(6B0F0 zr|1t%$}ywuNU5ji(vW`89r(Mb8yKVmAN792#{nEbc5M1;&C=)Yx9O*%a4VOPC`-N? zL&DZD4c;Axe{~%M;Y0OE=94-dbG_=6(&P<0L#lOI08TomM*en`aai&-)+ByR+Q zM4Q3Pf)UkuH@1b(%yXgDKmvK{Ly$|`CPR4HL>Mk0DEQ5PjN!I*Kh+TJB$%giLvjf! 
z8k%z58W3LyF|0VjcxtjHhr;Zcgcysp^5TM@+i2{!h1L_WA_0bglZL3bLyq2%c42(9 zueEAhfzb?(8jX$tKmgLSeO9yFd+2O?0n@6o>KMpV6rltm(5N3SgVT}$#reHWgQxEQ z{4xN-@d{%ob<)A3v=M5sA3S()3p`?Lvwb(q=yyPzdFtz2U$rg)QWqGMO+(*USN8z8 zBuPP&Mg3{_D^TtvfmYOvB>r9&Xt+#WJMiffVj)l0U0_19mR&UWXMV~Dq#`eajQg)^ zKMsM#b?FG&bZ?-3DMQBW`C`Q8Gf<7asSQqQckKW*8BbPL){N39&|~xs52pifqPstL z<6NCp7CMYDMcz*sDAR_^QhdgpECC0w(4tE2dnV_@?slF?I-|0;Pc)dqusGCX8I^s; zlxh3YO`JE3z3m}rt|BS;;JaN>Jd}kzX0Q*?8zC^!OMyJ>(>6iHm`}{Cd zBG`KL0MM5O+MDicYfm--mvWM)$K!l^;GhpUR8*C2$U-(KP@W_zoY7`Dn@3w_Km1QO zyi=mRon;=x5td<8nBdr-{H?64-ni`nEzj18R zU9Vo2nzgHX_YGsf-=A*q3owcFj7V=v=|vLT@tiyRHur)NB|Do+LMk*2HR%R`Ko$Kp zZh0zSMa2+*P+8NHuEe^RRD2Q(3pO|bTtrTY4CmCeh^~VSigY2rmN&KgfVph1P0&RP zu?&}&m+##bWI7N7emZC1Ag+KbY{IsLW27GPQ-2whD!o6>R||+%4>(XtP;h4lrI44TAt(~sjcUmtzTA^yjB zD6}w?Pv^aTO~~j+V#cg9V-Ta8Dh@K9zQpXDTfr9>e#2t>0-#0{2ExSDKe6nv5B~xJ znJ&i{1h&8$FljTOW}>aKgb~58jsJoa!-bN^ZGrTkIEYTbjJv#cA6&F$iy`hMTo%B$ z3lr^Q3L1-aK>l)x_O(MGA-qJ4w^s!a(cYzzeU4ygy}|8g{0GcoM~`m9*C`eyPd|dd zN4pmhT+LiqdtKfzQnH2Hi3kV(mpLR%NhMv;&OQ2ugGp5s5!p--`^_k;y z4oyd9C0y$fSFE$MwySgbe;Qahi`ng#!{Hf^;Er10OS2_1TBI;=$$WJhIZSE6ggxR3 za3z(yf~sG}-@82aVp|v+tq-kS_u7c7G0v}fj9xxg^%$XbbBc&bCg5?~4dD7kZp8V& zYcmadtx$wQ2s{T(ghV7~xubhjH+b8?J?M3iW7x?p^cKWV>c@vceA70O$>4NBwUlqd z#A%X*+%i;WkcEC1K18?>?C5HG4p_4$HwCd^IA~rwFW?|wD>b$o$sDK?p>>_f+js`( zLHD0f^Z6)d0iXH5Gh2!h#=A0vtz-~q_K&3m78^cWusNPD&hJ?dZL^#0B1x-2)58jG z`uf#~GinWIJQvtK27Fl2sX0E`+1fb(h-eSKXXTg3n~(wXk>21FxjBe)wWot!S~}Jl zCBcr(-N4v%=P}A|RPBW~>%+Kfg0L{iM~jP1ION6KXK|!U#qWmiDP8yuGGX9FoaOZd^|KvecN)N202+Q1DXb=fN+%}(Zv_{`KL5PbECsxhLhV^I(xkl z(P2keiGt39lq@-R-^{$tyx7cKux@|8oBM1EMfi7~Zc+ENk5YCtHi6~s+viQXG3Sd) z@>-d^mdJ=NQ*pmT)HX|BIVXeLSBo$4(Yl<~jfsLtrgEXy+9klE0kfMKC51bF!aqnt z(&_ygh;*euOD14sTmly9LI~%*S{jBmx zJ{O8U-6F@^8Ot}l%?&5EBbj%>bXl;F!Rfcj$jM-Yt4xVBXSZOBePknwFP9mjQWhX~ z%eO^51A{$(rmI02w*3Zz{RM!||M`d+P=v1<)mSB^(Wuu67gqhyNT8#102WpH1l5>> zdzxGXB_;06LzU6*9z3Q^R^+o)N5{VB`$r1I(Ib#0w7PG8vTS6nV@8vjJbm2Uz?-Eo zJvPMj^mD$3Zo+)~s&p%s`PpHH-zC^~TG$IF)Yd%vHdgNwz@lzur?iatfcvv2eq563<``U+%7AV# z@1cGxJWDYXFG*zh4sJ58>kza zilS{b*wvg&bMO5u9<~U?YNHz<)V&zMZM$2j@`9-u8w2>V=^K)U?9oT$KO##SCFd~3 zBl(|)Mb^vzp#Z^5&PVF+8U>T_IW*42@c6=46ysy|4=TijQr(E()sM``G<6 zWNiFGg(NA%&M<|P@vV@TNCBunwUV-wlotBh_wG>sJoyP!|0;m?=*Wb}*#mJvW4$%IvVqaT|rq`+eUi-HlMi zw}cB*|5D$;Uiri>4Vl!=m(}nwIa>{oN)t71*Zk~|@bgmE zAyY=Vql9(Lm{OZ(IOm+ArzdOK>$dko2$?|cngJtqhFpQPe8PhC@PX!)7qFS>1m%=s1HaUd&Y$y6NOx3D>Yc$-Cr z%?YhsxHHt7mu^4QTVmYY2o%ADCAV68@@dOCx>jcYod0ttQa$@?O~)UGzQ$L-*w`Xp zq$=tXC?0KI_r*V2zH2xSNCOC4cZJQZmsUTb81IjWhPB%ff-icWFZ0UOEPP2LeWwdE zeLS-@=3>{#6v3!|OBN3}X*T5kplwabLsb%i(aK2#nt(gSbUUR6&~*BGlPXlrIHaXY zH>1%CbW9-O;`c2Js?iyhzVU%C#G&2C%(~U zHg?am^ct&$<@4?8*)<3-%25;HHSj-}muU1YNi#^+EkbUDXzl+{;7*8It>N^&K_9j# zi6?mm5bUI{_$`OAhp(KbitmDJpl9aw)juSqemjxat@n*-4#FOO^Cp#)ioS^L3aGRu zW1qgX|66oqDfH}Vu{Vpa5{K_eCS!KgZLGsm0;@ma#@EFEhY|F;s@0GD;P2Z?lCfM# zDEa%MnAhNJYFFMpC0aT1%*PU?v|OlyZ{w}|#_ zeLFUQL(Kg#98gpUpG`XOXcl(GukXV@`Q}=!%~ZH}k~}h(_5_H+RzK9pZ$FS&dGGK) zZdCDyn0tOiOB$cHDkr{*-nTZ=ao}a}!mtliNO0w5O=*;t^E#$7&G|)#KVaAC{yns4 zZQ>=i7|YAX$2Jn6HTL#Vr`|BxCQ{|IiYaymy0PDsUhd|vEN(IfctR17m&v%k7@6hL zc<0!MiV}--TL^$Jt?u4JnYPHm)RhC^2gM_+2ff~%H8;gBC2(dU7xmPQ6a81Q7?}yO z*6StjkaX2cw$b-QJiU+J@8ahtI~e1CAtAi%eSmT>NkPf@-0MqRC={dn=9sC&Cv2+6 zbiHqMz|seBHzw;~T_fnL%A=j+-1No%7j|^rIbRjtrG-=Xsg9(2_IZ!=c%ZvlvBU|}SwM-ub5q^xTU;79(^}4l*ZPwQ%uY0=4%g7*1$9EXeQU08 z(pO~_Pbc~xcOgD)b*2u!9Y?UzmbwRqU4Qr*wV8dRlexb)vg}!!3@XO7WxHKfj!!M@ zf$%_tTVC`+q~f*VSF4@*I&-qQS$&1{rA>R~yZ zsH<~3x^j1~lT*{3C9b@~HAx?dtRKTQnqzUkX{<41L1V(p7cikN%r~p`)RSAno+H)5 zo@bTr!u-)}N@F*ALOEzP{*>y9dV0{jU$lf5e3yeAsajkd9kbeWoUV 
zvK>#4_ILaG@gR{%D$+tH{TH}*nW0rtQ9Zi9(r*ODv;OhOGV|s&V^wZKi@1r_OXH1A zJl>P)0zvMpPg@Jnz__V;qnY%TvjMN`oAS4*fZuG?!{k}R7b!_k5Rv*4-Ils%+u8K( z>)nl}n5>3?itEZvqA1K1Lf9O>h95MBntk0{uN)Cwd>?+o#W){m5l#DZg~<@MXmuYV7eQc0J%fBNVvsteAJ{O}WP)YMev&Weq@2DInZz$=Zy9GjFa*}5z7}0yi(|<U{B{Lh|N`EJf%3hbu3|8&pXsPuZ!a$m4-IUk;Vs zU=%zy64j~5R{u3>{eQ?vdk=e@hj6+*961^6Lv7tnGNeRe28Ep#;ln>#JrfqGCFxqH zM?|Mml4a$MeubP+jDuIu5G~a3pc3GaMP7ekeV77>&Iq%_ejB0io^{}LX3kKhYn#>e z(?NHB;~V^GOVWYsMn6PQS+zs9K;0_Q_E~dk6(Oj_jGVMq=_L2|fJdUiX_I@|)x;+b zQ~%@-&l>^n$Ae=t($msu5jIeIKC^y#kmFxkQmSsQMRq@_0Z^qR4Va4SYh;$f-2wq; zYxUEYeuObo?@*+DI;;H+PGhE<3TS_4(w|HFsEd00EC-Ib3{<6zaq{9VYhGTBSYb~) zIK~r{iCRZXrnlok*-44vdbycNQELhSH-L1*R|bl{SW%cgD%AV|HG2nk+(fw zf9o4;bMTTCPx}m&nz!2&`B>ArY6D+!JnRAm>&E-Q+YaXVHJU-bQe_e`ZU$d_fvEh;38NNyg-Lw>vV3pG z@gjBjdL(LN*(t-2w^MixnTc-Rvk?wo6hgU9iY*ChOf#-cn)$!wCAq)BZwvoGe5d5L ziMWyX9eZMQh6lx`7Iy#B@!2x$thL-|nT%#zb~3%DN|P_l3|)yXa`j)Z+$vx>+i(1h zESa^@+@X)?(pi);O8ZAkx?Cs)lz2}gtk!2+VMdc$jLUhgL^NDVH(qX8shaB|tv!mY z)-MEkm1uiHX0Js`+k3pQMJ?c@MZT?fF+$6*`TUrTjXHANJO| ze}&@g4UFyJhf(6qJX=lAyt%_Npy{m%03)YHGQ}_9F{$ZF$Y

crS=wSnDwQ-&5|v za1Tq3-dr$!n!F-6{@_|O0w!7G8%>q{k zUjyGh8|uG5?q6W>S1X2F@j%(|HQFu6h-Z;YS&#~SNN30KLCv(Q>2%s#@xjaG1ZkHP zHd0xG0hDjLqeb**3DUyN~ZNRrT(@yJMal7;rv7U-U)d!v}*Z^+whom!+}i zOvx^7mqvh_VRbB~a(|Q##!E&!c~|)8Kmnblg-)_Pv_FXR-*pHN@&Y>LyVkvun~a{} z2WukKw0n|#cTA@bG6b~uQz)(}l^{zBQKltf@+3cbUx6G}i2yin_j42r=n2g1%b%CE zfLlFwM9*~yHT%~!vjIekit5}2FJ$FBn1ww|bL-5^wwF4|dk3!>w!~{5KQOpG`Sl#N zQjIEHPWr{`7a*ST82lT4Vhz`3+L`|FenL2|*V4#(eWg}ZcmhG;lYjbhg+gfXu5g^p z_qu+B8dFtY;!(OOTA-JWf==pW9pSbz76a?r*bC^G$IbfCQbB*O+(E5G?$IwfNu_pq zlS){tOd@Kc{!GTBjqgT!GYXDI!OftQjV>^IftEq&hyPx>LLf7E9~!nmzxaPIp-u$) zJ%Ew?`?LREGJ;C*-|vC82>Ri_my7?uPyN4_AXEeS05dwuQlP_qFa$p75%dA~zx*SO z$erfXLl9_Z?_0=8fh&^DJvc)$AN+x%}F6vhzlP&K51KGd>IkP^1=VT;-0AvMZDZHn@21!$Rk4NOrH3UDM zJ~;zmfPO)q$FQU@@bQnQTnXyHr|mu6|5;L7tNZiAW_iUjusX^B4uPhC6uAr;qKJ-2 zZShM2weE7~bs;_6&%h4S=r$Mtj^g29XGwaL0-UK z4^T8;2La$f;0)+2tROK25EBd_8J@Cdn7T*Dfs`~7d|$e#>+7sPq-C*>$VXo((8FmU zRNTtO#=!;KLKQ@J-iJ_sNNPR-jy)S4^v!#~nXPZC*3ufVHPb*`-2k9rR>b%NRmTCa zA{?kRss|!weLqUy-`i!EAlcQrH<2|tBZD@&#H;bUvoxKoZ-Y4;uLf-90KH=+E8qw= z_M8J&TiHi&B85RKI<~0T;ZagtEc$-O{#Yaki(`l!TNVePx^dx;QoE}p*1x~6?qMcb z_<#^c@H-_JqTqdT;3|paGS+lj_bF6l zADG)L_%lQ4k-sP`^g9LK&lYx(s4xdKfOt~fE`3A8t?Xdn(jQ+9^teNZ1i?+JDXy1( zz*62ET-|+YYQ6hE!^?ziCPpFk!RWgHp!?|qQr_2U2%6@0aMpb!L3jaWE_cQ@i--X! zz^wT(b0eczmY3XKXz~^Ybg1D`Paz9)9Ctr!mq_hsJqVc<>Sv8)e6liXM{E9k5r1Q{ zIuEEEIhX3NR*3@u66zb21M(%`6j<;x0S%ax0JpF4ul=4&T^l&@JREBlC|v-f`NjY5 zmjCai#2}06txStA#RzBtc=Q0g#6jRa3iUqdMXy9hevRr!05Zr3<>>doan*K3F>6|8TUW zPwpl>gKK=j{p{Id?E5SNzg>@h=2u%X;CM9u6}0`UvfD=neEQqo4!wSF0qhd!g;WT~ zJ=$s781X*;@xO3Y9fj=JoU=)LodcY%fxAzBYrk!K^$UFvDa9YaRrHj2(jJVxiDquZ zw&`N;d=giqiTI9RPiX-rBNOoju!D|3thr{dasNb4w`6lb_RPKAUZ2#sAePuL6_{BS zE%}@%t7e?V9%#TEUMl$ZV&6%SMT3Et*57a6zHQdmD^i?w$IDQpcO@kfo>_dkyWCMK z9j?se+4=rNA9zpjoOy)Uh!d*_eog_}@-pCem!hOr8B%#?vijiQ3gG`VzdnqWx z*T#I1DQq3HlM);k1re8%fXHy&`#@JuulO>Nu&!HZA4>87)2>1vTvETg$fkW0>_ykt zDt!ZC7q`gdC~167TlPPaf*gh*mAH}h>s=`e9uUvl&ReC+j_-N(n>$nf+)ORwvnRWy zNfMk@S~zGd;wONF=^&ABWgOc7w~s3?6&&iv)YtQ$BiLP)&hir_vCpLyj2XX2H zA_qj7QsEn5z7jyswfF=08t4aAw;aT*#>d|LymtcPdCQqoh4z0S<)UIGn1R0}N$g#m zQ~{q;u$aD%$q@x3Rx*;v$4dkUiW0*D3P}`KkU!$(RQK2+n6;IC2R!F0`=t{Q?To2` zSi^rhNQ{0Af5QFM04ei?(TsDa#gXOR!LXC;_IQxr zLEVpaEe@0EW$?MBxw9{QYRlz9&PTR%?87E_kQ^mLv^)Me-W>N}?4`_w3;qScC;rd< zVQZ^KeUzZJ-f&X{c#E3NH+3+iO-^b1A#-i%{kM!Urm1h;d_WN=MWly`&b1CsRscE( ze=ve`q!m{SR?9KC00X^S1u)C$H<`?-OLShCpiGhCW}_8{26-FuN{PYGyWyjEi}c?E z&awUbU5MkXg7ecx2y}MnT2Zi|^cCE}RzF2EI41%wtqNB*9kf;e09NtFZ4ha4Qwu6Y z;y55@Sm-C91#i4Xb)&4pKJ`y?;xM+I9nptf&7FGHhnA`id<Emfrc!2nc&u?D$;4w~pB;gwGQ#K`MPF9rG#4@z5L;M?g@pfca_xkl-_ zl6ZP;yd3mG6xxo$PRFfr=>;h&0oQj|4pLYT?2O}JCP6~2u(R0)IGt9WlzOw*KuTw* zoZyga?uW;QTP@O8^|l|+KeU$EqO>z2Cr*V2S>%W7YsqcM8z}UxAv|(ke_K<^<70WH z?&IG6v&Gm9o8DkujLxeKa~Lmjxwcp(&_*w=qw{|eyX#U_f)2Be_jy{~0n{%Jyli{MCk2ngyVAL0xX z5rJunCl8-P;z2S&pU|Ky!Tq8^57b5Fk&z%wNXl&_SFa)s3#GOu-qc{0i9_4d+Mq0A znz>_d%sTiCg%Nqk!6wa!Rzshv*-FzjU#uOg8fH zE*Ty3I)W>LLy)x7B6TR&^!1tAi0W7Wn2JYrhvQcBM^aVneO&6ht0Qg)+@1%n)Wiq9 z&qlQI#T*|zuUOx15nW>%Rf|Wga44qTzjNYr{I(D_fY404F0A2rAKgvi(-h+}-@r|G z`Az{{|6H64G)p#tjC8BYeF$7S8v%Z$+#puEVRvEAyd>I#sB~6p?z(Qf_8)98>vGS8 z6ac2RaiHqVK4FYg3iyH!1Mql0RbgR0*1vyGQ7_9u8ST*mGm~2BR^;&(UYy#LYP@rA%7XnY}(;Ix|q`4SyE6V ziLc^alKWI0EJwyTk#*PO(2tH!iI(7RdK!a1X;Oh7S3nl=0D?Td;IKm26Ls90&B&pY zfs7D(*gLwyIeXEsSa)AvP(OM9 zxi4|V9BnkMg-+$MT3=#a=&dObR~Gho#m-g#S}%>o#I{Mo^7-X;`kWbuq`p{~U8D0L-`3H{n)_v*f%SUlmy|CL(#pDf zYmAu2P*Qa;GdZ(FQa&Z|KbwtHewh z?XUOMjLC8#Zy|R8Q`5}c^idI=2+<4?|LH?;tlfYybMC=N4Gqk%;>Q6%7LI zYOBIv^`;?rAil4BUuP};5%mW^?P{RV;B|=I;b|1Q1WL9QPymgNASXjbh$R#@fSzMQ 
zDTNz1yTA8ZRCZD#QTXByc2c=Nn_DB+m{j?a@^bgG0ZN}IfK>3cR60Y13OH@}V-pIy zxl-?(uC()OMaSW@Wr(mz3<=Ks;DANCyE&{TEBG>=-kYD2YT=Oh8PZiH;S>{Ae38Y1 zTnm%DHs@EOj*>?^vn3!!piLW0w|+8^+#VvT_u1O|@D*8m!L8z36_u5)&cMUlxG?XP zXHKE2msgsp&&t`spDiIZAD=9y7)#HaItEb28q{kxdvL5oJfsDVwd>*3bk@Isb;3tR z|Gf_P&S;eZx7SQ8Yi6#cyi4DxY-RRFu0{XTrGb8H{Gsq|jbXWSC)vK_i#g!HBAa|Q4kqZanwaT2=$ z2_z&}GlP0TfH)7JX%qpjmfHcS6Uz$#ONgv5m4|W#O#5BaCij7LuZ!{%(PrXoy?-$z zNrM{YOR4vQmRQwN!)i_w4oU_;hcYKv$ZZs-EZ3^!rgweGEv|Z&%7w%iaPQ|@jQ$?k zk;r0xiFp;w5NKxxIWTE2wJgQ(i88~vfq*L(jbAgqxcUI)hXs)DqblC?ThF(Ac#AJ)Zh{0wWbd!ae-b?YEl$NF9aAUEn;#4Dt-XPT`o`@U@TV*aIv zYG%8BWC7#-W=bv}HZ3ix67TQ=+Ms38FY1>(;2F>%X60_IqC_H4AZrpo40_>r=?<@cR8AiDHSMv zlzfjj1uClLkk7t9Wu_BzX!vsn_8d4lE%z|Eqykr{F1m!d2I)2qox65?BAP)baPL62T-EIh~{(qX-hndqeR-J1)u<1i52Sfa2S^O3Hv1r zG;EipKCgfi9J(k2dzulLNz?(H=AV-q=$s48t4O%uIk9_;p)j89S1WQVjPW)a>`(DN z&T&XtDRg(&w9h!Yu$-OEO+0N6?wGj?(!u19q@Uv!Y3I4BC$OTYXdZT+`c4n778BsB z?REeK+v_U3<_?q4Yn~bAfdwFON`Cphd|iW?h|CejfRvd4a+aPsc% z!(zi~x#sKe@N}!rP!R$g>f_|x;cs~c?8#Yx`DrjR(9WW;aoaGTqg3-1X9|~gY5`w- zq8pjqEyxb*o19v}`sg0gCRNtv)MM=6%Okj)e^X;i*6EkO)Y!-&ph7C^4+5R&kQzg` z*6#F-V|CSnm4|`M@-v!&Ql*!gBqe!S}_tbux1VFy5-^JJZhWLflV#cC+@c zK!>~eT+*7GsiB71XPxK6=F)fTS5@yP&9M}sk%n&uJqk{O528XsSlcD!2nbIMr~v9U zVIXVvmxF_26Y1N+@aZ*7c~}O^SG-pwh2STT_k31pD2@+cHXhSmc;Y_DFPU7_)i1k}7b%BW-ZPI`t&>ou^o6-ZiG{evy-@1jT-+a;o1W zqX9$l%jkDk8pYqia$^n1$WFNzv}DGUbi+iwy}h1kYn>+TzWC9%<{*JrlLe;W@mB@< z=*IG_$L;<&Hkao#(gJ_h3xb~~62d4}`JP)wEM9}z8YquCwjW-CX1EUA(;ItO#7wxM z4N(j7=b%Asu*FikhMT>XKZ%9g85gn~0sxVu`HNS&C{%TAC*?AACf6|yZvHJM<08L*? z)U9F};e-)9Ih!4k)OMx=6-KvkUIT4Mw_nX$Z-w2K?%7W46kS4OPM?ne`(%?dMl)e^ zk_&5NS%&`ob^jddT2Wo-jl;16+sjS>(-x;tfEZI_&9LYQm>1Y(jYg|3b}OYvxzPcX zPE9P)uJYDk028py$lxYSV-d;ljv~9mO!T8-U9-S6ro`2n>V_7U2Q~-XX|xqTK|Zk3 z|L~Z7W5-x*(co(*z-+wjjmI`5*U$<&^jo1A7uC|6dr4M^pOBmVH4d0a+bSg1^zraUq}5xP6Cc+Nzv)wm;U|q&YuSkx|8hbYqzPM(J+9x;80AZcd--Hl_JMxD&Wj-tK;)0u2J6j=e zdOC5Rxp08EJ8>^e&D&OWthP%{x0Na)f87X_K~b(|-1m1r{X?+>QowDZxHB zkmog(kqozM6Wg*^?NZ**LJ#M{W*Kf}ku46fA)X&Jf2`m1`t!-t@XGceKb&-FKkgH| zW6naha22z>7GcQBe~m7S2U8J*9->vfiy+F>@djdkhdYw$JN|cCBGpo2>!_+LmFtUN z-+Z8Dxdvy)Iol`y7Njkj`{=4l*3$8$ec{CYXY0K_1gyhG8^lH23Cq7}#2JD?AN{KQkGcmi z9t!17ZHJsQX_Ee_juhR|j&6p#Az3yscAmJB@p5IiTE;|TVVcw*817&9@^Q>xK5upZ zjVhAox?33}_x>oZY7RGOL1~vzooc#1#O%7^^1b_(-JRz;M)aw5w>UG9(9?;pf#NJ; z12R3^Nlr^kX62h=!fXcGH+72PbHF#6?XN{yKsNtXD*}9J8 z&?lmOw_bxf-;Qj`y9C(G4ZiB>ctoM{uce5pY|-o6?K8fL3IoW`Sm`J#{kyC{i*B}8 zu0SXq-qU-|fz4D$TJPWT0=VZ{s3hJTv2d}9iwTOHY+UQ}aXh432RRm2J%EoI($0-A@R(yVJ`PC?YU&HYg z^d^o;!`32FY1_5vd8X=2R6mA*=cTSKtM~uqM^aB6P;t-^$QK z?}k(aZQ*J@SlfJ35kuPzU9X<}-YquUR_ z{`?Leijw%#)XZ~7&0HhBQx5EGe4=kUQ;|WnSx}R%Z>fOy z?SDry^G%Hl8p!di|NGO>P8C#xupKyt2~GVH-J*|8HTa?9?~#wb=?sp}n4D`YmEll>9s@MO0SyXn5=f)0k!M`!UFQu4zoCDvc<|d$! ziUDK1r_dpJNtNgeG(CVhVFoDu0OAw&tNXuA^(7FldpO7|1~yg5^zW^tRJveR77VC`pmTjsG4G%K9jxf4z1*x_6SOUq_Zf6e zs8eH-^K-1hWOGbw-H2f2t)Sq$de{;Q)H)PzoH0J2^MS4&GC20vmH1|lN;`#V4Bs8#(KV;|(=nzg8>h1r|GLB#y};L zI~WYABH+NEnLv!Re_#MSmk6%_^j-j&;kqAxH5>plFPabIAMBi*Qo*Fj$UNX!|L+BG zZ{FxWr0cocw0Y-_yyijkD@AhXdW)(nB*+pg`}T}il!}U$c;Nl^8N*!vnn;!TuDPM% z>niikQzopqU`_Gqd(bfOuN-8o0$_c8$s3LDa&eaU1q9Nd8G`uyb4#Fu_p<_K0pot3 z)s2g%mk-KB9-ErZum2D_IsHyQpLe1jc;P`2H=Z@e;cW*aC_V8h z3*7ta>ak6N!TpB-!2i$JON8w$(l%n@0BjWGkPj)a^z`(4fjcl8E+OfJ^AK0V@ym; zYy~D%NpM;MWi21jopdx2(pgSoOMKpTG>DiOtU7rP?W!BDYrXQK)-;^bgs zQ~G#yAD4|#p4&V10-BznVa~l+Ouz(KjQBT*0f~=ofD`zDR=^ruCVD>feYm??V8)gH z`ng7ql>Y9??!w~-md}_wXJO*u>|z4S2hB!8iu^#;jP%^fVhtd$xsL`+I0?j`+<@Ok z`Y{cGpuM^Y&0V}0^V0Xgyu2iauH*zj873sy7tpZD9fZ!Qd3n8Sgq&V1^;_U^+z))2 zPSqu-3S5B2ROjj~fzDF{8DuAYY99e6bVQTIvPhg#l2bmm=|c)YUa! 
zir$BGTly2rKr**5A!9j(c+4s-Z34|ZB|)6M*sk6Wh{(?yc%UKl1x)^`H=vf$9aI4Q z4ff;3H8naUVIZZa=EIjC@ptAE0rj>FXuji7w^5yVS-@PpnU0|5AqthJQ56el<;u4~ zrM=QkB5doRu!;Lk+d3+aQSt$pg7ZKluEyu~H!T?R}FjK!2S`SY41ekjJv@veQcaL^8E6YWuPX|#s(G`VHxlAAT!F;HH3yc_lu*>@ zdzXdPT(Qdo)6Yt?=tX-6*!{k3sVtEp^#9hyj+sfMA)Ex)u45Na0%p2L#2mlz8-arI z-8=2r%yc2kKS#u)UN1TUf>i_vy-P?RB;WbehyWeV*uC>2eL0U+4gEqO-7DbLKWN(T zd8OuKU52;WS^qnd^kzA-e`)E94&f) zi_o7y0#}CO)XuFWXa^i;AqWTv-x#TcZ_F%xO)M3 ziasP^UA>tY35jLqe9V0*&3!jGTG#a<$>4-5>&R*JP>Of`nK)8(@aDQKw>kl=zAJtS zb<3X3fj3mR?TtA-LuFn+QCBfH=ijl`Pe8!6Wz_+BEKzgG;&4AU9%u;l24>nf*qclUC7|{S;2c%Df^nZuMy^~umExciW$kJxB_fGu_N5_NRZMlQf zuK*Lk>d(=J1J-S-yHcNywTu<+Tme$WG60Ys`XC+yt6&%#C;Ucwx_th25*eCMmadnG zt@Ed!j`2%9dr!?r);f}Hiv2Oa_`{wK^x0yUNM#`C z0G3dmO@QLTi%WG3*4BzDE1QWTIGx`M{ZnqEo>Oq6I7@<1NY4_5hGbJljUH*h#qe@p zfFl3Q&a>#I>lVyanpzs?`6#Z7v}#K%;th$&G7RFk|4{6E?6ZEMg7iP__`yQ@LnsBYVbl(u-(*MyP3t1P zv@AvP7gUoNqQ4TT@;Ev%(Iu-&qN%z}AH0n$9;tCL8s7I14#S69@M0iI>CN;C01%dO z1y{7i#jH@(Pj6dJ$RDzLZV?V^$jx%<1#~hB1XHGq!T=rqmLZRY#Hgb}#W{e@bS$xd z=9iQhaR+tYreqF+ny9_OQ!W82{y>7?46JnVtyb-Oj9F4YwTIY5{#;Y9$TkmtZZU|U z#p%}gbo)7c&6X&A&avy9L_=-!bIzWGrB#*Fi?+ z$woiExZ#rr9D!4n(Iwe1{nHQlNcVOb`hKx4r&=9!R{p?WzyPsLT3})e=J;`8efo`> znT;5F#sg)?HF44V&zvs;h=5w+rA`QT49Sx=KLkvtuIc^qm#%<*$|_Is?L`lM$G0C! z8+(4HiLt=;4V%;T_TzjvH9W{v7g$J+#-MUhV5Gg83e(M!(HZG zQa-VDWec`jt1Nrll@IbKJ5&YJyICx}I$AcoFgnaYMztoPBZRsV{(jJn~*uokJP|*XqbM zpJ-%+eri-l$v_1yy!y%arRRr)fFC1{&_M*B#`J==aX7f!4c@|AtS0lvP7tlvD!mrQ4(ziv4| z`aP=7hvdP-uiUMKHflD%3!hCbrQgKF{~i<>=>;?py8+U8@gny?9lp^(K5KQYi8`gO zM-;9HYoKg8>3zRfjwP7hxAX~$cjz%=Y;A2AAXT@S+{#7#MbFfAJ%Wx}3##I$fg3?J zjV-waYyBAU>-KKvsV8tgn4vRIv%zv)W3X>x)d$OhVWkiC5n>;`>LSF5<d{VI|QB@k7{*B z+y=e*`8lj7?~B`XjR5;_EAV7Zm^Et$Cc_4FdZ`2vXECzpHd8X8aA^C|zi34O3@jU> z|MO4-rHsrtQ1LMQh4ay%VG`5hVXW=fGAClQcNOQLY{5KyS+w>=@I;3i@Nm?v^T8f7Na z2%fKyKEt0GBv2r7U?6Xy@g87!EYF9Ald z5SzQVjzc(Q6Xb-`+M_z5X0kN9_L0qd!~l9R!a>&cc@ZVzbE}!Xr;GrEzvlxweGDtE zi8s{ZGS%FsV0b!|3)=kdE*PMGaCD=ugF)g9oi9P9U;^N>fo^aYwjnPny>lP0f=9X06DRt6wcm1GVu_wI4oI?T^_!CbIf2_DmPrPkJ298*hkb zn$kd{>PK;7u2Di~&d!AP%fCDMe?2~1mRo|HIQU`%8DP}$gP#sD3l&h{`M&A!>d5J= z2U)L90yYcZ@GnMM!OV$)@*gU3@2PcB>QF^SPk!3ne=tB-JntG|Nz2s#b#~^8%|}eB zW#@JbNZYWO)B%K)2#k*Es4vkR_y5HpcEki^zcvntO5GH((M%KibfK}iwZ$fg9`iwM zuzJu`PC{(|ENnci8o@{}5Qx!A|`oym!LF6#; zUOpE&R-u;DhQU1f$D5m*M@&JjAqtN^QRa?-9@O_m8%X-uoF5$>-F^)mB6jJJCcuQc z#ADNX-cx#(o5?;ydNn;&z0Dk-g3D5QbXo1OFER$?N|r~^U#AA4L;qPU*SD8h0JXC> zlo8D#A|kTn09QwXCSew+VjW;Ipp^f|XM{347npt5S^xE65dZ6Fd-@@$W7sIUw34`T z>A94<4;wBD{+R^4pLLJ*+?_X*zau7QZwfJj(?>w9s0oB(QH_AO)I)fKX#L z2;3^Y?Q>uS)#mIVqd8fmOBL3S0tG=FuiB4bL^c4Xd|WSNe`nHnqQ?x1C9db2D55=g(@;GMj2hDKD7F&-Nkm~2Ko3Q@ zP;5*epfL-cZ`Z$onkSG{$_+(-I|01g9dni)aRUruDc-#-{w zrRdLti)b|#6#+Qk$6sm`oK6BAD@d1y!lKchBt=#CH8qpWpEc>mLRel5h9GBp#zKhz zNq9?E*6ic1?kLp7y(O?nz-lE}Oe5d{f#6&swu6a)W&ZRlB|+E{XsX7jaW$3-Rj`XX z5K_W0y42ySj6yo_k^i3fp~tLcWTMf0FVJHsg<3KOaSblY1cp0Wk=KFQhR_FKmg||t z;?z1V_<+_fu_!@GEkR<5$jjCl;`j1E)7orMj!CuDuvY~FMlM($uG~jW10aw~<^RhM zuD#VRT!L5k(4eI`@f&p-g0dFw7$0#w3*xC-7=70Zn&SpQ#zOO%?t1EN`JNFSter{) z3wsL`s$h9_byMfAm$K@rMp<%;ImrWa{bF?rI&&;v!gVnVA6nwvT-<(6>#tv4bSP@$ zwvrY@fQG=V& zx+#eC?%ZFwKuQm^+Zkr+L_J8z*~>^vEAbk`&>FQt0T@q{iHU7NFna?M`OSR6Lju(x3!A3-J&-sb zR#)Q|51#1|QLvf(5snSXXYkMvFDw`3R_2#BRRlu^@t$IYjURaOz+*e1?3!=t`v6Pi z23}MR6w_z+N%gSwYU@oB?8ZE6wydS4rCvZahq2M4uN_>+6#G?s{)g}3y=HvIr}5aC z09*b2qkb`9$nycrhrHrXLAT}jy#qXG8n7t;pW5C!EUGwM7gbbH7{sEx6zLRckZw@A z1OWj-8fk_cqy%Y@ZY88t8tGE05kzSiQW~YR4rR3|D1bg?ZP?RhQqg8}8w`qyBw!&0Q`L zPi5;5Hp1#;tE%l!y>sjbORNRY#56xyCysnYs-wT2z-`Ogs9bd7cu!^`M?b^<^r*7t zf4gz?pHECs=?#Ok#q_`V_7JE07mmA0evNjjo8fuc;tixe_I_9dZQU30t+6I5?Gtzz 
zsz&#z%OJnqc`@C6CFL)GTB{64JsUN1uj+0^a#-uTT(zTX3o%qvdsUzfq-42^nOmVe zElwY5&f9;H|C~8hmPlZJ5m{3~l3q<#$->4qOz7a)hJS}*oOV1J?T6Zt;9FD>War4pF}}o1&w0F^(y*1Z$V6)wD^1aTSBIu%hUg_ha~>@l z5f|fjcMPJR<_*#KW{^ncNmW-|Xun~Js6toY1;P0MN}C%Q+;;oY;@y|V6lr|+?7KM(Xng*9&Iu{Tc_@&c?K4uJ8NL>`nO(q0iJp%6ju}7n#7Y6% zRZ|ByobALtUSt*|T}S#JOxN~w0d82}ZCtKMVzczUo3nhTADcJr%k_gh!Zo)g-js%P z-E(5dsKLygF_29roAxxhl{S~!iA!e*_|QbPNp$ljS+Zr2q8$PRqTd7H6hrMSCv6QA z!f0=5kLiGHfN_~zvE8%LIx~JcM0jNbPIliF5Gxyl^?bw{oWa?Ud&h#cqeP7>lcqxK zS>1C8`Ob#?ySL3?SRMKi;t&U5jC%bLEQ7wU0T6cidDFo{cIn5Tto)$7aq|Y3WBze0 zFJkN36!2$?x8^#l&U+pLFM~uFbTRyo%br}Oc0+5XPAPNN%AC~TGgt3?6A8UH?$|bv z8!b-ZOq{aNMlPKF_1-V`U0`FsZH6QtjUB89Oema_Tz~eU6S7VP)9koTBJ6tCi3H~( zYp-@BbiX#@3Y137H-xACFYy1?)-@(}CblRlDpHv6D8o{WPrd@aicuGz^xZQi#2VQM z^ew@SRtt$;b5Tb}Q)e`(Kpp9ABX^`YgMF8JhTK%UFNI73XiZ?ARrsXwmDpcBHxaO^ zMH6{q`w(p!zzUGvcgZ*iu#BaJIlwP;kytP8UUQ(QT@h;2M9yK|jplxW_49+WN*yE& zRD*iw2Be7yYkLj)MzAQ#_I-%SK@(BA4oI!M_)AV2?05Ra#QNua?MkQ132;3gN(4_q z#qd4KXHx_TsHwrd?mhs&^G?ANTQ00!f>siRnr+diD*oy;>W z!HlouCiI0x4@iw&UyC1sfap6B9pyMyFO@*v@7GNqTEy-d>VM2&yL|*wrzMBa4lNE3 zOu$E;?T@^8H(N8CW%$%CcrFcF{RFS0T|vA__u1J-X8Yc5ATJHEGt&JA+DpFWD$Erg z9cV1Dc#Dn#N2BZ%Yfo+&UwH^)65BCq3BQJg3*_|DHBcQ1P+4ZW`!@jTE`dVAjbO&# z9nW)ib4buCo(x_|nv8M4I=Mmqz@0xjl<}MN3f0?CeX; z9hxU&oc60v7A`v`9)pm9zU<*^$iRG7Oh-|eX1O`t^32nm`NBwzcMrcwvmsUDy%8~v z3sXazBVtodi{d&$<;)x$=M6;L@$Af24MmN7Zu^yZ8cFQ_4Q(F1NXVbQy<@oc!>imp6+ zm|wted`S2;R_?8);I?Gl`~~ExyA22JF=t)~$PW@u!0^#<{VP;RCLN8cgZ zmNm)gD@&5kvHT?9SsfoOGcR>K%cLuAov)hgG=Vj`Y1MbAaF{rLm?25Akh-!{>^fedRWQeGuFp?)%vDB$R?YJqHi9XSYNFoE0AI#EQOdd6BX9QRL+?E&VGQu)Xoh{k3xbB}CxMHs@d` zgU(45!1`i%*hV|Y^x~3D=+0NVm)JS6%qNqXFnzXd$K1r*Lr{~E<%CiK{qL@C=#Lq> zEf6vPe0c~6q?X{a@Q>=qTj_|lz1`hgHsN}uNR@r*5d&LQo6C7hMLl=-`JFOly@@F( zDJ_QD-|{}g2r#j+8K;h){Om2*yr0ZACXf(Sr|xcG@UHABo!?&(>mmZhbmcevZzvee zhQ{AapMKY>Q84=Hb4SHIjXNs%t#jr?1eywX2u9Bn2p|FH9U3oncZi#_`NR6ykV2QH zD%qZ9TaC}Xd#If-6^fNKiKGJWhD&Bf8Sjk6JF&@@&HV(zpyO!PF9Tv}z3#{@!va~(*yoU0rxBA*! 
z;d}Xs(-Z!mIQSMF$G9mHg(d{7T;zCm8a1Be;ZM;$)2D6Bx0GIo=ft4nL(z}j^cCJI zO;FBG9)f8|o}K+Z^SouUAY&HS^{Fe5-_AXuPmy@va$tKo)EyCqEmlq|gW zaLc%@M)Z0?J0`+WOD&DmWVSW<@*FGP@m5lA0Dtks$?euJ?~*DRT7#+elN+Ihg{+;3F29GursO;6RC-xFc-yR}G^!ECp~?KI}mrR5s_lu*qT z%O%ze>zc1YQ;>X)*ieQB(_!WrBd_CGBx;lvY@?4d^DO7TYXH(0umSNz%k`A=r(Nx@ zi~(7Ubz(L_y%y-|yQ;L4xMJiznN&jh97+&(Z3(w5mMjWEZD1Hrzx^`S?8DK7(KW(3 zO9e}iAY?WocqmdWm;7`SLU%HY5XIF5g6wU)T+)jp?9#Ni8Kk!rCwIq;8ca~cO)9c| z19#J(lGpjPCz#35#D~aQEX`kMYpZ0~Gwf>U>%Qh0BRJlGVDk}j?Z>nYoYRv*4PvE^nZ$r!!S)Sx)Ok_~vp`K=rj63xYoNd3NRXa1OE9a5%jSBd)>*;J zl+n?nfte8xwJa}36gNPo(xlaVi?<-sq_MYE@75~4H6ze4m>K!Bd0i$F65bi>vM2Ps zn5x*WZ94t?NZ z=&BmGJK<8x9ejs9$zWBK(xy(-*=Ev+EK}Sw{OTMg`vC1~lWUQbdzy-t2ZZrF%Ux=FBapktU_dX!b!rL;E0qT3;#qk#n^C8uKz2vXQTH{T8M zGSLRaFxvO6#Zu<;>*T1+W_=p?dvP69o^)D7uoJQoVM7jk@YJJ|Ze`C#q|2Y#Iaw+Q zYaXDj3?T!_hfaD=zpec>y=7Nu%VWLunf|cuYsd=T$g9JU$3$u~P1hXQq`%@2Alm9i z>1nvh`RMO(p8R`9>Syecka8-e;7S`PO(cfCeU;+5T!^jP(9fMCa4(h*oNQ+v3;k?O zZN-NLGEsxyLD`LMb^nK`|9_I-W1s&|QU8A_eE)6zziRxkfsfcb_Wz>s$EM)@lidBs zP9mAOFQ=xa=7H$_>?3KmhC*OVmI*cS{P63YPomTAFHd2dVaB9ZD0ul@aWMuA`C`Ya ztmKUn3FCWW$w>Fp`K(@v!VGyE<_ZeWpFf{h>k1H6OXijWM^1YiYwMRaAE~=tJ?}0j z&d!o6D=YtT(bi6&2YOWqhzlBqCRu>|*JMZbMtl^;bs`wpTvhLfYMr#{k)d3qsUA+VkCB z?!ki?G+GOb1yH@lk=Xh>!qmq(MF45- z)yyO^sqdaU#_}lg1TW`qLO5&;Xv`lrzpv<|z;c7S0ZaYKDaHVAB>naOlx6qmm%sBq zgqLHmdFWV^+@FC|JWqnLxFe8A*s-`LR-fq%66|hlS~o;_$3iabJDdaf{NGQore6?o z(*IMW9J;B#&jA-0D?*7?Wo4m&$#(6{q_Q@Jt4PLkqFs@=hHz6xi4ov;rLx=mqsY zz)OHK&C!>0TeXk)lAJKhCoo%38pe8kY6T1)GDnY%jor9>5%L-AH7~s!CQ0NQ8F>f5GQIR!Dh=&E{*}rjln{ik&%(zkJ#}` zpG7ikhcRG{ezCc=sYpLAT_ZF#ECvt*BluNN4xOhMw|feVk*vz~spjUZkLZ*UfuAP5 zlr3e9b_?iehd^84&QFu^rux8mf)RZRuP9{06=0|xvyey^vRXwjk6R1>tb%EGv=+dyIMI|krkz>IYjr?<}PFz_x(wI zO}4g;8t#Z-XkN;Ruz&!kcekbiN*(4r0;E!#2I~Qg?UM~X$BT=LA7MOJ-WI+?-jeUn z5YyK042+eh&uUk&f(UkP=2s$0+gP(>=;95pitn3JA2q{g&xaWIO6a8(fC0st1atSS>m~YDThbV8d3i#?0E|>SBN5 z7a|sAmia7N_JBJjz<^6cQz9n=I3yP5JUqc_m%RO@f`>w|77q8uEN-qDbm{4XJHsE( zp3&K?P>ABl{62E8KV*>CdVns0Yhv>l7n_(f4vG*8+HJ;YGK#!2AX#)nmW|9ZR{dd2VXNjAW`yFscBoY>3YFSj- z*WW+N+oI7ukS#5cX7!`oEcNOEAKcGK)a%JEvSi%J@OXfYq8fv5S9>s9gPiec31x@s z>NX%(f zo7LYH_qL&nFTyyFb@9of_Zp?fJKcJ z$?7YiC21R@=))pc7pFPCA2BK|DavAKa10K!1OZY~bu5=r2uDWWx(?0i$TRN?{`coP zC<*s`$t2v*$%Z;JB5MY7n7R0@zM)-vxbUNz=QBt6N*{Z%FHw&HexfCgoxGU*GZP*J zgi^V_ce%{U+P^yY&ezw-dD%3wo4aMEzX!r^`_9+g*C?qzZk)ZHmc-R~Z=TXUgMjg> z`w)F|UIcGfGf_n4U*28bS*}hZGc4I>`5ImuL!PS1R*)#SR}3C5VQ;#}<-QK9vvd#c zLvHpQDU$KRQ2$DESR1O<;5?EHSqml5=yjFO{cZjI;cNN11pF-PC9lU2CP{`s7_3we zEx)r8r3?uf$_`OZx^FWqraD@!%iy1*PoeRQJ!E%==Lz>kZp6zs5xZjZR)DvSkcf$0 z!m0qPF~<=qM;ia~#5I$sYWa)nP z+*mB%-VSzxurqK;-3Lpm6nAH@S*spa zOC>d!-bHZ-OR>@B_m|^(=3>?jR}sY1v~7$Tt$P!zu3vYkx=Dhpow7tzB=;fs=U%9} zQRbqV2g;1tk8eChqi2Uh67ksj2!I?+7xIC)*5d zGW&A}iV;)sHA`Dke!B|XwDc6mak$%dr7Bok0kfZ$3FnFV~7#TV>P zA9%z3sfhXRalQymZh(_@-DFCOIAmFCufTmc{o~WQ0Nw?1ZmPdO%!#zE=L576EGnRZ zy5IlLc6{g%ZdC&BSwtmUu1kbRmw!1g$-~#~Z}wUAVmyr;ku)yZ0~XzQcHDm0VVTpA zSpVi_zaBi}C6yu>HBkwrwCdz!(thsqt!4@_R%i95*XT%5yfFFuacm?iC`P`!lr_(D zADc?8{f)(ZzSrdmdLX}lBO)O1{3oDst~#J1eNSHoEz17QQ|-UnOf1Gp=CFG^dwfeC zLZq1x5iwlb(hZaMTe{~CPGn@vm&>(r<>XP{9vE7dA1gR1aqRvNUzlSH9P(Z*9S#nQ}5kL0yM5f+EehpMphbr4m zOBJf%MVj#N-IOW+q1%lfV%Jz0Bk^us+w&FozwZ-`*Nc`Qq$+-Uwk>5ZN}hzqIw*eQ0{Gh3J6M@AyWB*%d#*d6b=E z+*I_jixZJ~m=(Q4dx+F$Zv0|C7)j?k@gdqb`nWG*y7h!v7XUSqOW&nULAa62 z$$RQLk@4{%clGD; zb8l3(QX1Lxgl#&(g5%#7*i!fZYy`ISg}XjO?7|O9{||@kZ1b~@On>=JcWD0NnLd{= zq_0xE`G)d*F~yI8mnY}-eyJP5YsrlQl|>B)|4oz+mL~a+I!w-B&;CIKrv87}FE@X! 
zoaz|NL|y*f*^MiHDvE=+@=9Yt;RyoYbg>P}Pha1Qt&M^o_g zvGv-tDDH~|b9V`Y{ohpGxS~{DUv7)CczfagSdJlR3!pyZCVNRKIJ8rL{cwkxo31m8 zvQXsM>E#Wl*-*#d=emFT<=UD-FGWrME7bG=sQ-BM*Mt1c{9zvd_|vN8>MW))m#&TQ z_hIPP=LPw8_0+U0EwvZz!Yy1mV5lg}`}7+7W4rFGxX&n&>Vaiq)0-K@6^-0hLe~VX z_y^1-9WI34qlPrIpVH^Z2nV}Fuv5mK0C1^_FumruGz>NEU5GZXs&jfrzlh7d@ws0g z+m8u@O$yl+DWXdQ-k*(vps*HSzZ-)+ii_KSq6E6$8a$VmtY;+nDAh0WtD$I z$j=+z*k8H&VTp723q)zq>}&(cf9#?(+})Wz!=Jx3O=xTI+;I(+`Pa?;+m|xo(T$SY zh15EV7TEvNxSpy#yMUbzKe(;@+bH&2aZeO8ijSBEXc=O+dLqxmtIdCYw!;*l=$EEn zo->mc9Kl(qAbzs@NQ8b6#-m{{!jB6)uVG@s3F~n&wwuD@o2vnGY&f@?ADeXJ%2J`) z#Vka0LCa&t)J zbG8yzQ@^`0g$zk6#D3~(BCwlsv`&E<*l_d+o51D_qn{p7(sK5T?nL^)fW&K;r|>3Z z+8uh`AFTYmEfo(wb31Ms=S;7SLkD)4;z}nX@@wHL_NbEU6uUc;QFs99+!6#9Sl_S} zAsisWL{PD}i%D4c-&m}f2X8sIT=(;qYWK%QeKLjJ-hoQ|(OlcU2k7tv0 zR>&C|6J^M*sKYKfJphhUQ+*@2zxnwB20ZZ}hNQiIeZ98oT>M|`T&&a^yV$vswm`6o zL3l)j_81ag>@YnJ|HMMli=4A&8VA5)@6^T12uRH6d_~O$xPGqR9bkVI=>EGxOn}UJ zk=wo*5)ZMolU!XtztacCf2q`=q82QcAmg`=`3;R?<)qiD01+c(xAbBOaS$;1wUOXo zc;#tI=w~1QFJDw93{63}@d&Guk{bo?lNwIdk@xOWw##MvhAp{t^rAOG@|pw$7HThj zX7$ICL_jfq?)ilItr8HSI%BgS;N+XIW7=K$u8cyE=ky}=7ir<>lBH9Ee-OFrgad@a zIYXQNFY#0wpA44_0nef;s(}}F#jGX^7FuT@Bs@zxgZvY6cl&?11QSSp(#a4PI**=T z$--`3l6-r4{)Yzk$qG@0D?)y0+}fRy6ckubv3r;5fTrvgn8|u@t^Q^mEz6V23f^;j z=C6e0lE^9K3^jNpSitWz@2uReXYHxTBMRyWQ)z{#Dn+k%zi5#+D0w6#RP4gLkur5S zcaLzJ7=|}%NOnFN}+e5a{9t9Lz=s|iG)kit3g%T^}ANRQ* zyA}m+74txn`S$JI*ips<^+1f4Q3jugpz1>%NtY%UvWsOZf;CDW>u4{z6%T&mywJaK8=X3_URt`W=a@(S5fYjB(1+>SmWC-{hSQ zeoh+X+$<8c;~roU^D0pEi{8c9-*Uwdk|a3z?RDi|zS7gO;+Ii*K`0gOYw^EJVt^U< zvgQT5k0m8SUr0rVC6us?YQdu3`A5BZV%vyy5m8UtTesK=M&*aYb-}3L&elX4Hj=ne zUrg4;^wXbQJwIUr%t^(*^1DPmO5tpy%lbAC7fYsI;bUTs;q^>dLdnahMn+$ZJO7|y ziKza9CLT(+b476P7DX(5HJH?^;1MS2YmTIpBW?G>i=|lUA=c1M{r}lW6MKRDpOL&k z>j&8}CWGxR%JHV$EKyNWrKm+3zs*&fXU`rAmK79?v{80e#t&jt<$EIrn}Q#p%_H7l zhAtBwh(2W ziMB|7eeIQM&yJ}n!Tw)RhcbLoZ-=US$xhiDg@V1zsh%y9IR&SpGkk-9L zokDx%R!Diy;ga$7?K7im48N7>v;94w+2r))xa6B z_hj$P5S9c+tP$mP3W^mBAY3E(uG{lrD=4YW5ySWIM2sH~RzF=k^j1|pHvU1om(#Y* z^ZR2KFHs1EB|vRX#On@VzbVm|%(L1TL^RnblVHbi5YaWe{^7-xwKp8qyO4QZ0)~`} z1}*0?FAjI#I*(ae*+xuA%laOzjY3ZQ%ra$TgA9jG(P&%w?fr7>;T*~ zK&Y|?H0DwB`AHcglNa#SEiFR;(9s4`5hKO{Ny!XMO<9-wtPAxUr-gQP%PWzb;*!k9v|7Iy)%8kpvmb?fR>Qwccz!;2Kcc#@H^kMJ9b=I@%hDKvFkum`NC_|FM>?IgKcfg{5LPE2}N#awBV zMxqTSyZ0cDj0FJ5(k3qGB%6HBP6U7Mn7W!2Inem+i|gM0>*)67^w~wnC@LYPhmRh8 zF!F`?V}sKXJ@YIor}op=bp@KMxq$2{uJ%OTGmu>!fYVa7@1kXt*V^A-?}wcg-Y80zh<(KOI#Xv7lh$6>w8zqOq zW%u>j$%X`l1ZKN#mDBCduAQ%XhUbE2DWgLzhV|=|wr+Z+D+)^9>nx(xl@A|2Oxr$+ z)+vDelS(k4?L)tf`y&z(`ru$#eM)3?5BZJmhc*NT;T-G^?|rz@v_H#dY*yE&-LTjV zti)Vm0lC}w2PjI;QExCxdBv6{Xsx@t)fMfwH_N75=d<|rO)eJ>R|lkbD5QIV)lR+hIzpzOIYTZW!Vowm@t1K4|Y)8z&_7K8~v?ae*pEDkT6TFl z_((l3!#R%O#woo(<`sI-+uPbmCASzX9MSomrzz*x*HwK)1&`gMbR)C#!$7GR$5E`3PLWJ3>>7O}XaI{bJ& zw=Ygi3ZWp~D69)AlSMLIWia(J7F56=X7MJ8TZPE6(wGNO9Xg*q}Ec@<~0-@J=Q1zOW#g6|Z zuGAW>%Y4tV34;R_NSl)0<1JZ_L{(`AkP?55m^U>mVmaG5d!f53z{NGCpZiHzSwww@ z?0Et&dODPHzrF?cjrAeC)p}r5t8q40q3O)_18G9}`WqB>KAfH66=S~ic^@vz_W(}I z4DDSZD@yt*3{E=ZhtHWt_K=^$abZ!8WX z8a3lV{pzMAdhkTuID4WJJqy!S(|va%8_{H?nijT2NMWg9^<2+h<|B4j89(=yPwgz1 z+}&n!ROwlt>=>xM9jcR-VwJze@_oB62Oi_D+OzQ!X!I?P4WB}lz29SoNF?IvEHE$( z#P?U{F0uB*iVK2s9Vbk=#vB{_@BEF9;495Vvst-Dxd~-3Yt+$-x0#UDZ^9%N ze?UCy`j*V2-AB@7Y)+atVY6RezXX|~sb%RyaV0g>m zK8q{<09gI9L5Y?py7_r{dnuQz7XL&$$%OXLO~dOC2OVl=3ZDJpm1(JzsWZFVO-5t( z;E8A@YJt1Carj{_a?q>&it0dOX;nT ze=}+6nHsHRhHSb~BJZlB)di=HX*t@AU{;0#PVHHBm%mqPcE8?BhM$-JpjwX)m=7}? 
z-&a?+pFDp!htuk2ww-~>=It*vpYel5 zJljvKQ`pQFXI&C5f^5lP;(}4g;p3=?nM);F_#`2|-7#i$G)eS$LvOfe+)G))xI{$y z^eZ2)CfkApBLol;_e2=28Pqq!C7eGZFxOe~=X9%UW4?{3%QnE+pgb5P$m%c<(=~2F z4&%SEdL!x?YcMC*6%-y`p&HL&o(?L%@HCHEAX$V|=ouI`CH!f@n{!jiEHgsrvWg9u38Rr4jR55hX@lOSV&P64gpWyxHRPIo>*2 zH%>0tAWYbR;VH?MB@^YYmL~EQ?HOuB|I579%a=U9UYd7r^5?KP#9z6c`>@NtvM2F1 zSt5cqadH5Nn^KE`Jn@nnoR%K-YcEE4&(M$Ns{Je?cZxhlyPjbCN>iAI=8>O5v=t1C16AS)|Yd{6HR1kV;eGb=obYTjoSn>sAC3ZxH? zMAZFsPAxSuWiRn|ud^L?8zP_k0;J}QNb34RZt0T`qZ#f^58lc6?4=FzWHs=%rOuQB zRrm47NiTV@+Df+o;_~;A8|E2^D`rKfixQun>E{pSMnaXjQ1@VZKzeT^w zs~x17BhIR+d*eIm&pmk1LN9)6*1Zei?Vgv`Q5hsNFM59RM=%kPGRZ#folZEB^hMWe zg3u8B2p*0F3lX9kR>Cg2J28=Y_lw2eLAeo^hEjLTEq@^~!uu$+9VRgbPLJREtYDe^+yxB^k%oAA`*O7`oGvN4Hq^SwCoJm<%vWK-5@*PU>B*(xAmIl+IupcTiFFM z{39cZ%?Xu^oYVxpGbKIed%pA=&||nYlT*Gr+$hWP;-|EYpW9?Vxg~IUNy=15aPKYI zTS=$f{LbU+tTKqpy+mTVWF4>|K5R>JmKEd@BZ$9*P;DAMY}Lqm!gSTE^at~6UTH2e z%jDif{CHj!uP+y^5HGr8vQud=Q8nDjx&}5iP41CGy4u>-_ajID20Q<_RTS4GE&No- zE((O?cT-5?RdjXb>`L#29)|GB`a&|s#pO|ltVE&}W(v+G74Df7UTHC(qGB=LUdr@CDi%Nazj2$sSzKExxa0J(gf|v(6czPK(uyPWwU7Ms8f^oe?1V?zrrPga1FE!9sBs3I}G~+Q%j|^p?h+P*0Pl-@tr9y5<$mWkT4cU<8`}k*56%1v@?_Ood1Hyv zwa6|jG+z}?VKcgVp^WNjy|Ze)R+m-j7owxTyOL7am}76Nu4dX$7}1T7oRnw@A*|JZn8)pgHw$f+hU4*j~5!wUKVaEN#9?p#}x->S%=%Zed3$+Mhc{_*AA+meCB;uc3Ekx0oRINmMSXGpFy>+-s()H=bN%i zg5Hlt#s@+rtVBwluM^rVa`_Sfeng=1Jap30c&&{W<+deLzI;`WYxN`Vn;`R#Umh;B zecck9*GB>^yRqdb$LC#p=;YIJEmGWwkn&?@qud?R=Lt7BIXG0$P$wb7trB@PXUC08 zj;Bri%C2Oa%Ln~2v9nDg$P$lfn)P6UMX}#W-t%2kJAN&N8*To3pEegS^;3G&?$#f5 zKOS#*JRA9!$j2cu+SZA;ZR}`!Iw6}aXo;1)Zah#quMt$@BCzc;O5L?nA&g`>00zg zdz%As=^Y;mqie;qPc|N9_)c_)O=s*aPJB)xJoVC-4c%(ya5D_@-pCmadxiID$!a#E znV-~wr(Vy5%_dQH*YQuV`#GGrB38nZ$4i7 zF!_Koy+9DBK_XLkiue_M>!M!1Ovqg3%})zHviaE| zeYGyKi|S!MBR|H)PAHuSUWYzy9=}pZej-T4usz}Fmai(|K}#cWjXIC|Z|)QWGI!Q- z-}$=npxn(P??9yA_>+m;Gp~boog$+ePvrPcGjV3^MnYEqP0RMh4$Aq@z8hVO9rXwM z$B*fZX^BWP8)>KA3x6N` z2yG8ejZ=LV-gh1CHS@_FUE&fX3g-~MdX~~`)OT{?z7??7_xSjr2JQ~x`};mr1V@iI zCy$y3ZM^hl7ReiEH>SEaCJcSgCd7Jiown+o`a}0*LU)k9KUvn7d>jw@Pu~;awPRN0 zHx{<~LuY;u?;*D+*QSpCY*@3K6o@-qQ{zk9({q;;3`~lSrfjZpnjuNQvwi)Q;BngE z-nlbzt{PA2m;IB`{SofAe<=?L2DcoygqWFSPicP|ChSms(y>1W{}g4_WXc|x2L3;D CbBSjF literal 0 HcmV?d00001 diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index c0c32594f2..996ef00a6b 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -19,6 +19,18 @@ Two main reasons: Please refer to for the example usage of disaggregated prefilling. +Now supports 5 types of connectors: + +- **SharedStorageConnector**: refer to for the example usage of SharedStorageConnector disaggregated prefilling. +- **LMCacheConnectorV1**: refer to for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission. +- **NixlConnector**: refer to for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. +- **P2pNcclConnector**: refer to for the example usage of P2pNcclConnector disaggregated prefilling. 
+- **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as: + + ```bash + --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' + ``` + ## Benchmarks Please refer to for disaggregated prefilling benchmarks. @@ -48,6 +60,19 @@ The workflow of disaggregated prefilling is as follows: The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. +Now every process in vLLM will have a corresponding connector. Specifically, we have: + +- Scheduler connector: the connector that locates in the same process as the scheduler process. It schedules the KV cache transfer ops. +- Worker connectors: the connectors that locate in the worker processes. They execute KV cache transfer ops. + +Here is a figure illustrating how the above 2 connectors are organized: + +![Disaggregated prefilling high level design](../assets/features/disagg_prefill/high_level_design.png) + +The figure below shows how the worker connector works with the attention module to achieve layer-by-layer KV cache store and load: + +![Disaggregated prefilling workflow](../assets/features/disagg_prefill/workflow.png) + ## Third-party contributions Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors). From 766bc8162cb37ad32605eee051d4f049ec325926 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 7 Aug 2025 16:45:04 +0800 Subject: [PATCH 064/932] [Core] Store only the keys for multi-modal data in P0 (#22198) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 30 +++--- docs/configuration/optimization.md | 83 +++++++--------- examples/offline_inference/mistral-small.py | 2 +- examples/offline_inference/vision_language.py | 2 +- tests/models/utils.py | 5 +- tests/multimodal/test_cache.py | 51 ++++++++++ tests/multimodal/test_processing.py | 48 +-------- vllm/config.py | 30 +++++- vllm/engine/arg_utils.py | 22 ++--- vllm/entrypoints/cli/serve.py | 5 +- vllm/envs.py | 6 +- vllm/multimodal/cache.py | 95 ++++++++++++++++++ vllm/multimodal/processing.py | 53 +--------- vllm/v1/core/kv_cache_utils.py | 4 +- vllm/v1/engine/core.py | 7 +- vllm/v1/engine/mm_input_cache.py | 99 ++++++++++++------- vllm/v1/engine/processor.py | 17 ++-- 17 files changed, 325 insertions(+), 234 deletions(-) create mode 100644 tests/multimodal/test_cache.py create mode 100644 vllm/multimodal/cache.py diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 4d5c961af9..dcaf1069bf 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). 
+- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process) - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits @@ -129,20 +129,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory. Here are some examples: -??? code +```python +from vllm import LLM - ```python - from vllm import LLM +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) - # Available for Qwen2-VL series models - llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - - # Available for InternVL series models - llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) - ``` +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 811925c19e..bb7342c93f 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -2,6 +2,9 @@ This guide covers optimization strategies and performance tuning for vLLM V1. +!!! tip + Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory. + ## Preemption Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. @@ -126,62 +129,44 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. -## Reducing Memory Usage +## Input Processing -If you encounter out-of-memory issues, consider these strategies: +### Parallel Processing -### Context Length and Batch Size +You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing). +This is useful when input processing (which is run inside the API server) +becomes a bottleneck compared to model execution (which is run inside engine core) +and you have excess CPU capacity. -You can reduce memory usage by limiting the context length and batch size: +```console +# Run 4 API processes and 1 engine core process +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -```python -from vllm import LLM - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - max_model_len=2048, # Limit context window - max_num_seqs=4 # Limit batch size -) +# Run 4 API processes and 2 engine core processes +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 ``` -### Adjust CUDA Graph Compilation +!!! note + API server scale-out is only available for online inference. -CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: +!!! note + [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled + because it requires a one-to-one correspondance between API and engine core processes. 
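+For example, to give each API process and each engine core process an 8 GiB multi-modal processor cache budget (8 GiB is only an illustrative value, not a recommended default; see the next section for details on this cache), set the environment variable before launching the server:
+
+```console
+# Raise the per-process multi-modal processor cache budget from the 4 GiB default to 8 GiB
+export VLLM_MM_INPUT_CACHE_GIB=8
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct
+```
+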
+ +## Multi-Modal Caching + +### Processor Cache + +By default, the multi-modal processor cache is enabled to avoid repeatedly processing +the same multi-modal inputs via Hugging Face `AutoProcessor`, +which commonly occurs in multi-turn conversations. + +You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable +(default 4 GiB per API process + 4 GiB per engine core process). + +If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: ```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - cudagraph_capture_sizes=[1, 2, 4, 8] # Capture fewer batch sizes - ) -) -``` - -Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: - -```python -from vllm import LLM - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True # Disable CUDA graph compilation -) -``` - -### Multimodal Models - -For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: - -```python -from vllm import LLM - -# Accept up to 2 images per prompt -llm = LLM( - model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 2} -) +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + disable_mm_preprocessor_cache=True) ``` diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index a38fc9216d..59ec22a1e9 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -166,7 +166,7 @@ def parse_args(): parser.add_argument( "--disable-mm-preprocessor-cache", action="store_true", - help="If True, disables caching of multi-modal preprocessor/mapper.", + help="If True, disables caching of multi-modal processor.", ) return parser.parse_args() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 16bb3712f5..5dbe001994 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1565,7 +1565,7 @@ def parse_args(): parser.add_argument( "--disable-mm-preprocessor-cache", action="store_true", - help="If True, disables caching of multi-modal preprocessor/mapper.", + help="If True, disables caching of multi-modal processor.", ) parser.add_argument( diff --git a/tests/models/utils.py b/tests/models/utils.py index 4657df60b1..27ce9de469 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -9,7 +9,7 @@ import torch import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config import ModelConfig, RunnerOption +from vllm.config import ModelConfig, ModelDType, RunnerOption from vllm.inputs import InputContext from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs @@ -257,7 +257,7 @@ def check_logprobs_close( def build_model_context( model_id: str, runner: RunnerOption = "auto", - dtype: Union[str, torch.dtype] = "auto", + dtype: ModelDType = "auto", model_config_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None, @@ -279,6 +279,7 @@ def build_model_context( model_info.check_transformers_version(on_fail="skip") model_config_kwargs = model_config_kwargs or {} + limit_mm_per_prompt = 
limit_mm_per_prompt or {} model_config = ModelConfig( model_id, runner=runner, diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py new file mode 100644 index 0000000000..e07b73bd25 --- /dev/null +++ b/tests/multimodal/test_cache.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, + MultiModalKwargsItem, + MultiModalSharedField) + + +def _dummy_elem(modality: str, key: str, size: int): + return MultiModalFieldElem( + modality=modality, + key=key, + data=torch.empty((size, ), dtype=torch.int8), + field=MultiModalSharedField(1), + ) + + +def _dummy_item(modality: str, size_by_key: dict[str, int]): + return MultiModalKwargsItem.from_elems([ + _dummy_elem(modality, key, size) for key, size in size_by_key.items() + ]) + + +def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargs.from_items([ + _dummy_item(modality, size_by_key) + for modality, size_by_key in size_by_key_modality.items() + ]) + + +# yapf: disable +@pytest.mark.parametrize( + ("item", "expected_size"), + [ + (_dummy_item("a", {"a1": 100}), 100), + (_dummy_item("a", {"a1": 100, "a2": 110}), 210), + (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + ], +) +# yapf: enable +def test_cache_item_size(item, expected_size): + cache = MultiModalCache.get_lru_cache(2048, type(item)) + + cache[""] = item + assert cache.currsize == expected_size + + cache[""] = MultiModalCacheItemMetadata.wraps(item) + assert cache.currsize == expected_size diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 508c773b8a..cb489c47fd 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -6,20 +6,15 @@ from typing import Optional, cast import numpy as np import pytest -import torch from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, - MultiModalSharedField) # yapf conflicts with isort for this block # yapf: disable from vllm.multimodal.processing import (PlaceholderFeaturesInfo, - ProcessingCache, PromptIndexTargets, - PromptInsertion, PromptReplacement, - apply_text_matches, + PromptIndexTargets, PromptInsertion, + PromptReplacement, apply_text_matches, apply_token_matches, find_mm_placeholders, find_text_matches, find_token_matches, @@ -902,45 +897,6 @@ def test_find_mm_placeholders( assert result == expected -def _dummy_elem(modality: str, key: str, size: int): - return MultiModalFieldElem( - modality=modality, - key=key, - data=torch.empty((size, ), dtype=torch.int8), - field=MultiModalSharedField(1), - ) - - -def _dummy_item(modality: str, size_by_key: dict[str, int]): - return MultiModalKwargsItem.from_elems([ - _dummy_elem(modality, key, size) for key, size in size_by_key.items() - ]) - - -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ - _dummy_item(modality, size_by_key) - for modality, size_by_key in size_by_key_modality.items() - ]) - - -# yapf: disable -@pytest.mark.parametrize( - ("item", "expected_size"), - [ - (_dummy_item("a", {"a1": 100}), 100), - (_dummy_item("a", {"a1": 100, 
"a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 - ], -) -# yapf: enable -def test_cache_item_size(item, expected_size): - cache = ProcessingCache.get_lru_cache(2048, type(item)) - cache[""] = item - - assert cache.currsize == expected_size - - @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), diff --git a/vllm/config.py b/vllm/config.py index 899862bf54..44a8d871f0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -444,8 +444,7 @@ class ModelConfig: model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ disable_mm_preprocessor_cache: bool = False - """If `True`, disable caching of the multi-modal preprocessor/mapper (not - recommended).""" + """If `True`, disable caching of the multi-modal processor.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -1692,6 +1691,31 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None + @property + def processor_return_mm_hashes(self) -> bool: + """Whether the multi-modal processor should output hashes.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return not mm_config.disable_mm_preprocessor_cache + + @property + def enable_mm_input_cache(self) -> bool: + """Whether the multi-modal input cache should be enabled.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return not mm_config.disable_mm_preprocessor_cache + + def get_mm_input_cache_gb(self) -> int: + mm_config = self.multimodal_config + if mm_config is None: + return 0 + + return envs.VLLM_MM_INPUT_CACHE_GIB + @property def is_cross_encoder(self) -> bool: return (self._model_info.supports_cross_encoding @@ -3369,7 +3393,7 @@ class MultiModalConfig: disable_mm_preprocessor_cache: bool = False """ - If `True`, disable caching of the processed multi-modal inputs. + If `True`, disable caching of the multi-modal processor. 
""" interleave_mm_strings: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3e2f03d56c..a18cd9dde3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1230,17 +1230,17 @@ class EngineArgs: enable_multimodal_encoder_data_parallel, ) - supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not supports_mm_preprocessor_cache - and model_config.is_multimodal_model - and not model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with data parallelism when there does not exist a " - "one-to-one correspondance between API process and " - "EngineCore process, so the cache will be disabled.") - model_config.set_disable_mm_preprocessor_cache(True) + if model_config.is_multimodal_model: + dp_supports_mm_processor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not dp_supports_mm_processor_cache + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal processor cache is disabled because " + "it is not compatible with data parallelism when " + "there does not exist a one-to-one correspondance " + "between API and engine core processes.") + model_config.set_disable_mm_preprocessor_cache(True) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 9762a1de9e..02b78f103c 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -163,9 +163,8 @@ def run_multi_api_server(args: argparse.Namespace): if model_config.is_multimodal_model and not ( orig_disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with api_server_count > 1, so the cache will be disabled.") + logger.warning("Multi-modal processor cache is disabled because " + "it is not compatible with `api_server_count > 1`.") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats diff --git a/vllm/envs.py b/vllm/envs.py index f6c6d7e7ed..212eaf015a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -65,7 +65,7 @@ if TYPE_CHECKING: VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" - VLLM_MM_INPUT_CACHE_GIB: int = 8 + VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -561,8 +561,8 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - # Cache size (in GiB) for multimodal input cache - # Default is 4 GiB + # Cache size (in GiB per process) for multimodal input cache + # Default is 4 GiB per API process + 4 GiB per engine core process "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py new file mode 100644 index 0000000000..262b22e554 --- /dev/null +++ b/vllm/multimodal/cache.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import sys +from collections.abc import Mapping +from dataclasses import dataclass +from typing import TypeVar, Union + +import torch + +from vllm.jsontree import json_map_leaves, json_reduce_leaves +from vllm.logger import init_logger +from vllm.utils import GiB_bytes, 
LRUCache + +from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors + +logger = init_logger(__name__) + + +@dataclass +class MultiModalCacheItemMetadata: + size: int + + @classmethod + def wraps(cls, value: "MultiModalCacheValue"): + return cls(size=MultiModalCache.get_item_size(value)) + + +MultiModalCacheValue = Union[ + MultiModalKwargs, + MultiModalKwargsItem, + Mapping[str, NestedTensors], + MultiModalCacheItemMetadata, +] + +_V = TypeVar("_V", bound=MultiModalCacheValue) + + +class MultiModalCache: + + @classmethod + def get_leaf_size( + cls, + leaf: object, + *, + debug: bool = False, + ) -> int: + # MultiModalKwargs is not a subclass of dict + if isinstance(leaf, MultiModalKwargs): + return cls.get_item_size(leaf.data, debug=debug) + + # MultiModalKwargsItem is not a subclass of dict + if isinstance(leaf, MultiModalKwargsItem): + leaf_data = {k: v.data for k, v in leaf.items()} + return cls.get_item_size(leaf_data, debug=debug) + + # sys.getsizeof doesn't work for tensors + if isinstance(leaf, torch.Tensor): + return leaf.nbytes + + if isinstance(leaf, MultiModalCacheItemMetadata): + return leaf.size + + return sys.getsizeof(leaf) + + @classmethod + def get_item_size( + cls, + value: MultiModalCacheValue, + *, + debug: bool = False, + ) -> int: + size = json_reduce_leaves( + lambda a, b: a + b, + json_map_leaves(lambda x: cls.get_leaf_size(x, debug=debug), + value), + ) + + if debug: + logger.debug("Calculated size of %s to be %.2f GiB", type(value), + size / GiB_bytes) + + return size + + @classmethod + def get_lru_cache( + cls, + capacity_gb: float, + value_type: type[_V], + *, + debug: bool = False, + ) -> LRUCache[str, _V]: + return LRUCache( + GiB_bytes * capacity_gb, + getsizeof=lambda x: cls.get_item_size(x, debug=debug), + ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 46240855d1..0378539495 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import sys from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, @@ -16,16 +15,16 @@ import torch from typing_extensions import assert_never from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves, json_reduce_leaves from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import GiB_bytes, LRUCache, flatten_2d_lists, full_groupby +from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby +from .cache import MultiModalCache from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargs, - MultiModalKwargsItem, NestedTensors, PlaceholderRange) + MultiModalKwargsItem, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -888,9 +887,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -_V = TypeVar("_V", bound="Union[MultiModalKwargs, MultiModalKwargsItem]") - - class ProcessingCacheOptionalItem(NamedTuple): key: str value: Optional[MultiModalKwargsItem] @@ -901,48 +897,7 @@ class ProcessingCacheItem(NamedTuple): value: MultiModalKwargsItem -class ProcessingCache: - - @staticmethod - def get_lru_cache( - capacity_gb: float, - value_type: 
type[_V], - *, - debug: bool = False, - ) -> LRUCache[str, _V]: - - def get_leaf_size(leaf: object) -> int: - # MultiModalKwargs is not a subclass of dict - if isinstance(leaf, MultiModalKwargs): - return get_item_size(leaf.data) - - # MultiModalKwargsItem is not a subclass of dict - if isinstance(leaf, MultiModalKwargsItem): - leaf_data = {k: v.data for k, v in leaf.items()} - return get_item_size(leaf_data) - - # sys.getsizeof doesn't work for tensors - if isinstance(leaf, torch.Tensor): - return leaf.nbytes - - return sys.getsizeof(leaf) - - def get_item_size( - value: Union[MultiModalKwargs, MultiModalKwargsItem, - Mapping[str, NestedTensors]] - ) -> int: - size = json_reduce_leaves( - lambda a, b: a + b, - json_map_leaves(get_leaf_size, value), - ) - - if debug: - logger.debug("Calculated size of %s to be %.2f GiB", - type(value), size / GiB_bytes) - - return size - - return LRUCache(GiB_bytes * capacity_gb, getsizeof=get_item_size) +class ProcessingCache(MultiModalCache): def __init__( self, diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index eab1560b1a..38b1d9b13f 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -429,8 +429,8 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, if mm_positions and len(mm_positions) != len(mm_hashes): raise ValueError( "The number of multi-modal positions and hashes must match. This " - "is likely because you do not enable MM preprocessor hashing. " - "Please set disable_mm_preprocessor_cache=False.") + "is likely because you did not enable MM hashing. " + "Please set `disable_mm_preprocessor_cache=False`.") # Note that we assume mm_positions is sorted by offset. # We do not need to check all mm inputs if the start token index is out of diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 79c47e1028..78b8fe4ea6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -35,7 +35,7 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.mm_input_cache import MirroredProcessingCache +from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig @@ -124,8 +124,7 @@ class EngineCore: log_stats=self.log_stats, ) - # Setup MM Input Mapper. - self.mm_input_cache_server = MirroredProcessingCache( + self.mm_input_cache_server = MultiModalInputCacheServer( vllm_config.model_config) # Setup batch queue for pipeline parallelism. @@ -413,7 +412,7 @@ class EngineCore: # Note on thread safety: no race condition. # `mm_input_cache_server` is reset at the end of LLMEngine init, # and will only accessed in the input processing thread afterwards. 
- request.mm_inputs = self.mm_input_cache_server.get_and_update_p1( + request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) req = Request.from_engine_core_request(request) diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index abe98a13df..279c9f0007 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,54 +1,68 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional +from typing import TYPE_CHECKING, Optional -from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata from vllm.utils import is_list_of -# The idea of multimodal preprocessing caching is based on having a client and +if TYPE_CHECKING: + from vllm.config import ModelConfig + +# The idea of multimodal input caching is based on having a client and # a server, where the client executes in the frontend process (=P0) and the # server in the core process (=P1). # -# -- Client: -# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs -# with built-in caching functionality, with mm_hash as its identifier. -# - MirroredProcessingCache to keep track of the cached entries and -# determine whether to send the MultiModalKwargs to P1. +# -- P0: +# - BaseMultiModalProcessor calls MultiModalHasher to get the `mm_hash` of +# each input multi-modal item (e.g. image), +# - BaseMultiModalProcessor processes the input items into `mm_inputs`, +# which are MultiModalKwargsItem instances that each correspond to an +# input multi-modal item. +# - MultiModalInputCacheClient accepts the `mm_inputs` and corresponding +# `mm_hash` for each item. It stores the `mm_hash` as keys and the size +# of `mm_inputs`, but not the `mm_inputs` themselves, to avoid taking +# up additional memory in P0. +# - The `mm_hash` is always sent to P1. +# - The corresponding `mm_inputs` are only sent to P1 if they are not cached +# in MultiModalInputCacheServer. # -# -- Server: -# - MirroredProcessingCache to store the MultiModalKwargs from P0. +# -- P1: +# - If the `mm_hash` is cached (i.e. `mm_inputs` are not sent from P0), +# MultiModalInputCacheServer retrieves the corresponding `mm_inputs`. +# - If the `mm_hash` is not cached (i.e. `mm_inputs` are sent from P0), +# MultiModalInputCacheServer stores `mm_inputs` under the key `mm_hash`. +# - Either way, the `mm_hash` and corresponding `mm_inputs` are sent to +# the engine for model execution. # -# The caching for both client and server is mirrored, and this allows us -# to avoid the serialization of "mm_inputs" (like pixel values) between -# client (=P0) and server (=P1) processes if the mm_hash is found in the client -# cache. - -# Both Client and Server must use the same cache size -# (to perform mirrored caching). This cache size is set by the environment -# variable VLLM_MM_INPUT_CACHE_GIB. +# Both Client and Server must perform cache update and eviction based on the +# same item size. This ensures that the keys of MultiModalInputCacheClient +# and MultiModalInputCacheServer are mirrored, allowing us to determine in P0 +# whether a key is cached in MultiModalInputCacheServer by querying +# MultiModalInputCacheClient without having to communicate with P1. 
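# Illustrative walk-through (a sketch; kw_A/kw_B and hash_A/hash_B are
# made-up names, not part of this module). Suppose a request carries two
# images and only the first one has been seen before:
#   P0: client.get_and_update([kw_A, kw_B], ["hash_A", "hash_B"])
#       -> [None, kw_B]   # kw_A is not re-sent; only its hash travels to P1
#   P1: server.get_and_update([None, kw_B], ["hash_A", "hash_B"])
#       -> [kw_A, kw_B]   # kw_A is restored from the server-side cache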
-class MirroredProcessingCache: +class MultiModalInputCacheClient: + """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - def __init__(self, model_config): - mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = ( - mm_config is not None and mm_config.disable_mm_preprocessor_cache) - self.use_cache = not disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, - MultiModalKwargs) + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() - def get_and_update_p0( + self.enabled = model_config.enable_mm_input_cache + self.mm_cache = MultiModalCache.get_lru_cache( + model_config.get_mm_input_cache_gb(), + MultiModalCacheItemMetadata, + ) + + def get_and_update( self, mm_inputs: Sequence[MultiModalKwargs], mm_hashes: list[str], ) -> Sequence[Optional[MultiModalKwargs]]: assert len(mm_inputs) == len(mm_hashes) - if not self.use_cache: + if not self.enabled: assert is_list_of(mm_inputs, MultiModalKwargs) return mm_inputs @@ -57,20 +71,37 @@ class MirroredProcessingCache: if self.mm_cache.get(mm_hash) is not None: mm_input = None else: - self.mm_cache[mm_hash] = mm_input + self.mm_cache[mm_hash] = \ + MultiModalCacheItemMetadata.wraps(mm_input) full_mm_inputs.append(mm_input) return full_mm_inputs - def get_and_update_p1( + def reset(self) -> None: + self.mm_cache.clear() + + +class MultiModalInputCacheServer: + """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + self.enabled = model_config.enable_mm_input_cache + self.mm_cache = MultiModalCache.get_lru_cache( + model_config.get_mm_input_cache_gb(), + MultiModalKwargs, + ) + + def get_and_update( self, mm_inputs: Sequence[Optional[MultiModalKwargs]], mm_hashes: list[str], ) -> Sequence[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) - if not self.use_cache: + if not self.enabled: assert is_list_of(mm_inputs, MultiModalKwargs) return mm_inputs @@ -85,7 +116,5 @@ class MirroredProcessingCache: return full_mm_inputs - def reset(self) -> bool: + def reset(self) -> None: self.mm_cache.clear() - - return True diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 692a7dd564..6e37ebeb87 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -19,7 +19,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MirroredProcessingCache +from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) from vllm.v1.structured_output.backend_outlines import ( @@ -50,11 +50,8 @@ class Processor: self.tokenizer, mm_registry) - self.mm_input_cache_client = MirroredProcessingCache(self.model_config) - - # Multi-modal hasher (for images) - self.use_hash = self.mm_input_cache_client.use_cache or \ - self.cache_config.enable_prefix_caching + self.mm_input_cache_client = MultiModalInputCacheClient( + self.model_config) @property def mm_registry(self): @@ -256,11 +253,13 @@ class Processor: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. 
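# Multi-modal hashes are returned both to key the mirrored P0/P1 input
# cache and, when prefix caching is enabled, to compute block hashes for
# multi-modal tokens (see _gen_mm_extra_hash_keys above).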
+ return_mm_hashes = (self.model_config.processor_return_mm_hashes + or bool(self.cache_config.enable_prefix_caching)) processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - return_mm_hashes=self.use_hash, + return_mm_hashes=return_mm_hashes, ) from vllm.platforms import current_platform current_platform.validate_request( @@ -312,7 +311,7 @@ class Processor: sorted_mm_hashes, ) = merge_and_sort_multimodal_metadata( decoder_inputs["mm_placeholders"], - decoder_inputs["mm_hashes"] if self.use_hash else None, + decoder_inputs["mm_hashes"] if return_mm_hashes else None, ) # The output of merged multi-modal processor (`decoder_mm_inputs`) @@ -339,7 +338,7 @@ class Processor: ] if sorted_mm_hashes is not None: - sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0( + sorted_mm_inputs = self.mm_input_cache_client.get_and_update( orig_sorted_mm_inputs, sorted_mm_hashes) else: sorted_mm_inputs = orig_sorted_mm_inputs From 7e0b121812a30975365497e39608898312c18984 Mon Sep 17 00:00:00 2001 From: fxmarty-amd Date: Thu, 7 Aug 2025 15:30:48 +0200 Subject: [PATCH 065/932] [Bugfix] Add missing `packed_modules_mapping` to `DeepseekV2ForCausalLM` (#22352) Signed-off-by: Felix Marty --- vllm/model_executor/models/deepseek_v2.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 68a0a83d62..c2880c33cb 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -726,6 +726,9 @@ class DeepseekV2Model(nn.Module): class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -733,6 +736,19 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config + + # `packed_modules_mapping` needs to be modified before + # initializing DeepseekV2Model, as it is passed inplace to + # quantization config init and may be used to select the + # quant_method for relevant layers during initialization. 
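# For illustration (assuming q_lora_rank is set on the config), the
# resulting mapping contains both entries:
#   {"gate_up_proj": ["gate_proj", "up_proj"],
#    "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]}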
+ self.fuse_qkv_a_proj = hasattr( + config, "q_lora_rank") and config.q_lora_rank is not None + if self.fuse_qkv_a_proj: + self.packed_modules_mapping["fused_qkv_a_proj"] = [ + "q_a_proj", + "kv_a_proj_with_mqa", + ] + self.model = DeepseekV2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: From 4da8bf20d08f1f8f97a4839d580eb923d0ca9415 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 7 Aug 2025 07:03:38 -0700 Subject: [PATCH 066/932] [Tool] Fix auto tool call (#22434) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/serving_responses.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f26f92537c..21fc209af9 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -132,9 +132,6 @@ class OpenAIServingResponses(OpenAIServing): "\"auto\" tool choice has been enabled please note that while" " the parallel_tool_calls client option is preset for " "compatibility reasons, it will be ignored.") - if not self.use_harmony: - raise NotImplementedError("Auto tool choice is not supported " - "yet unless using Harmony") # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we @@ -212,8 +209,8 @@ class OpenAIServingResponses(OpenAIServing): await self._make_request(request, prev_response, tokenizer)) - except (ValueError, TypeError, RuntimeError, - jinja2.TemplateError) as e: + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError, + NotImplementedError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") @@ -313,6 +310,9 @@ class OpenAIServingResponses(OpenAIServing): prev_response: Optional[ResponsesResponse], tokenizer: AnyTokenizer, ): + if len(request.tools) > 0: + raise NotImplementedError( + "Tool use is not supported in Responses API without Harmony") # Construct the input messages. messages = self._construct_input_messages(request, prev_response) _, request_prompts, engine_prompts = await self._preprocess_chat( From 4815b00f5487a070a40c7451c2cfcaef80786220 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 7 Aug 2025 08:33:25 -0700 Subject: [PATCH 067/932] [gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410) Signed-off-by: Chen Zhang --- .../openai/responses/test_basic.py | 2 +- vllm/entrypoints/harmony_utils.py | 153 ++++++++++++++- vllm/entrypoints/openai/protocol.py | 31 +-- vllm/entrypoints/openai/serving_responses.py | 184 ++++++++++++------ 4 files changed, 290 insertions(+), 80 deletions(-) diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/responses/test_basic.py index 974ea8673c..18c35152e7 100644 --- a/tests/v1/entrypoints/openai/responses/test_basic.py +++ b/tests/v1/entrypoints/openai/responses/test_basic.py @@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI): # Whether the output contains the reasoning. 
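# With Harmony-based output items, the reasoning text is nested under
# `content[0].text` instead of the previous flat `.text` field.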
assert outputs[0].type == "reasoning" - assert outputs[0].text != "" + assert outputs[0].content[0].text != "" @pytest.mark.asyncio diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index ee08d62b57..87e76e08a0 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,18 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +import json from collections.abc import Iterable, Sequence from typing import Literal, Optional, Union -from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem +from openai.types.responses import (ResponseFunctionToolCall, + ResponseOutputItem, ResponseOutputMessage, + ResponseOutputText, ResponseReasoningItem) +from openai.types.responses.response_function_web_search import ( + ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) from openai.types.responses.tool import Tool from openai_harmony import (Author, Conversation, DeveloperContent, HarmonyEncodingName, Message, ReasoningEffort, Role, StreamableParser, SystemContent, TextContent, ToolDescription, load_harmony_encoding) -from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem, - ResponseReasoningItem) +from vllm.entrypoints.openai.protocol import ResponseInputOutputItem +from vllm.utils import random_uuid REASONING_EFFORT = { "high": ReasoningEffort.HIGH, @@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]: return token_ids +def parse_output_message(message: Message) -> list[ResponseOutputItem]: + """ + Parse a Harmony message into a list of output response items. + """ + if message.author.role != "assistant": + # This is a message from a tool to the assistant (e.g., search result). + # Don't include it in the final output for now. This aligns with + # OpenAI's behavior on models like o4-mini. + return [] + + output_items: list[ResponseOutputItem] = [] + recipient = message.recipient + if recipient is not None and recipient.startswith("browser."): + if len(message.content) != 1: + raise ValueError("Invalid number of contents in browser message") + content = message.content[0] + browser_call = json.loads(content.text) + # TODO: translate to url properly! 
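# For now the query/url is emitted as "cursor:<value>" rather than a real
# URL (see the TODO above).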
+ if recipient == "browser.search": + action = ActionSearch( + query=f"cursor:{browser_call.get('query', '')}", type="search") + elif recipient == "browser.open": + action = ActionOpenPage( + url=f"cursor:{browser_call.get('url', '')}", type="open_page") + elif recipient == "browser.find": + action = ActionFind(pattern=browser_call["pattern"], + url=f"cursor:{browser_call.get('url', '')}", + type="find") + else: + raise ValueError(f"Unknown browser action: {recipient}") + web_search_item = ResponseFunctionWebSearch( + id=f"ws_{random_uuid()}", + action=action, + status="completed", + type="web_search_call", + ) + output_items.append(web_search_item) + elif message.channel == "analysis": + for content in message.content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=content.text, + type="reasoning_text") + ], + status=None, + ) + output_items.append(reasoning_item) + elif message.channel == "commentary": + if message.recipient.startswith("functions."): + function_name = message.recipient.split(".")[-1] + for content in message.content: + random_id = random_uuid() + response_item = ResponseFunctionToolCall( + arguments=content.text, + call_id=f"call_{random_id}", + type="function_call", + name=function_name, + id=f"ft_{random_id}", + ) + output_items.append(response_item) + elif message.recipient.startswith( + "python") or message.recipient.startswith("browser"): + for content in message.content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + text=content.text, + status=None, + ) + output_items.append(reasoning_item) + else: + raise ValueError(f"Unknown recipient: {message.recipient}") + elif message.channel == "final": + contents = [] + for content in message.content: + output_text = ResponseOutputText( + text=content.text, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + contents.append(output_text) + text_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=contents, + role=message.author.role, + status="completed", + type="message", + ) + output_items.append(text_item) + else: + raise ValueError(f"Unknown channel: {message.channel}") + return output_items + + +def parse_remaining_state( + parser: StreamableParser) -> list[ResponseOutputItem]: + if not parser.current_content: + return [] + if parser.current_role != Role.ASSISTANT: + return [] + current_recipient = parser.current_recipient + if (current_recipient is not None + and current_recipient.startswith("browser.")): + return [] + + if parser.current_channel == "analysis": + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=parser.current_content, + type="reasoning_text") + ], + status=None, + ) + return [reasoning_item] + elif parser.current_channel == "final": + output_text = ResponseOutputText( + text=parser.current_content, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + text_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="completed", + type="message", + ) + return [text_item] + return [] + + def get_stop_tokens_for_assistant_actions() -> list[int]: return get_encoding().stop_tokens_for_assistant_actions() diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ea2cf57563..3b9f4b544e 100644 --- 
a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import ( # yapf: enable from openai.types.responses import (ResponseFunctionToolCall, ResponseInputItemParam, ResponseOutputItem, - ResponseOutputMessage, ResponsePrompt, - ResponseStatus, ResponseTextConfig) + ResponsePrompt, ResponseStatus, + ResponseTextConfig) from openai.types.responses.response import ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning @@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel): usage: Optional[UsageInfo] = Field(default=None) -class ResponseReasoningItem(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"rs_{random_uuid()}") - text: str - summary: list = Field(default_factory=list) - type: Literal["reasoning"] = "reasoning" - encrypted_content: Optional[str] = None - status: Optional[Literal["in_progress", "completed", "incomplete"]] +class InputTokensDetails(OpenAIBaseModel): + cached_tokens: int + + +class OutputTokensDetails(OpenAIBaseModel): + reasoning_tokens: int + + +class ResponseUsage(OpenAIBaseModel): + input_tokens: int + input_tokens_details: InputTokensDetails + output_tokens: int + output_tokens_details: OutputTokensDetails + total_tokens: int class ResponsesResponse(OpenAIBaseModel): @@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel): metadata: Optional[Metadata] = None model: str object: Literal["response"] = "response" - output: list[Union[ResponseOutputMessage, ResponseReasoningItem]] + output: list[ResponseOutputItem] parallel_tool_calls: bool temperature: float tool_choice: ToolChoice @@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel): text: Optional[ResponseTextConfig] = None top_logprobs: int truncation: Literal["auto", "disabled"] - usage: Optional[UsageInfo] = None + usage: Optional[ResponseUsage] = None user: Optional[str] = None @classmethod @@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel): created_time: int, output: list[ResponseOutputItem], status: ResponseStatus, - usage: Optional[UsageInfo] = None, + usage: Optional[ResponseUsage] = None, ) -> "ResponsesResponse": return cls( id=request.request_id, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 21fc209af9..d40231795b 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -6,12 +6,15 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from copy import copy from http import HTTPStatus -from typing import Callable, Final, Optional, Union +from typing import Any, Callable, Final, Optional, Union import jinja2 from fastapi import Request from openai.types.responses import (ResponseFunctionToolCall, - ResponseOutputMessage, ResponseOutputText) + ResponseOutputItem, ResponseOutputMessage, + ResponseOutputText, ResponseReasoningItem) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs @@ -19,26 +22,28 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) -from vllm.entrypoints.context import ConversationContext, SimpleContext +from vllm.entrypoints.context import (ConversationContext, HarmonyContext, + SimpleContext, 
StreamingHarmonyContext) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, - get_system_message, get_user_message, parse_response_input, - render_for_completion) + get_system_message, get_user_message, parse_output_message, + parse_remaining_state, parse_response_input, render_for_completion) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ErrorResponse, - PromptTokenUsageInfo, + InputTokensDetails, + OutputTokensDetails, RequestResponseMetadata, - ResponseReasoningItem, ResponsesRequest, - ResponsesResponse, UsageInfo) + ResponsesResponse, ResponseUsage) # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.tool_server import ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger +from vllm.outputs import CompletionOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing): # Schedule the request and get the result generator. generators: list[AsyncGenerator[ConversationContext, None]] = [] try: + tool_sessions: dict[str, Any] = {} for i, engine_prompt in enumerate(engine_prompts): default_max_tokens = self.max_model_len - len( engine_prompt["prompt_token_ids"]) @@ -231,7 +237,15 @@ class OpenAIServingResponses(OpenAIServing): trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) - context = SimpleContext() + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, tool_sessions) + else: + context = HarmonyContext(messages, tool_sessions) + else: + context = SimpleContext() generator = self._generate_with_builtin_tools( request_id=request.request_id, request_prompt=request_prompts[i], @@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing): request, sampling_params, result_generator, + context, model_name, tokenizer, request_metadata, @@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing): request, sampling_params, result_generator, + context, model_name, tokenizer, request_metadata, @@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing): request: ResponsesRequest, sampling_params: SamplingParams, result_generator: AsyncIterator[ConversationContext], + context: ConversationContext, model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, @@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing): if created_time is None: created_time = int(time.time()) - context: Optional[ConversationContext] = None try: - async for context in result_generator: + async for _ in result_generator: pass except asyncio.CancelledError: return self.create_error_response("Client disconnected") @@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing): # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - assert context is not None - assert isinstance(context, SimpleContext) - final_res = context.last_output - assert final_res is not None - assert len(final_res.outputs) == 1 - final_output = final_res.outputs[0] - - if self.reasoning_parser: - try: - reasoning_parser = 
self.reasoning_parser(tokenizer) - except RuntimeError as e: - logger.exception("Error in reasoning parser creation.") - return self.create_error_response(str(e)) - - reasoning_content, content = ( - reasoning_parser.extract_reasoning_content(final_output.text, - request=request)) + if self.use_harmony: + assert isinstance(context, HarmonyContext) + output = self._make_response_output_items_with_harmony(context) + # TODO: these are all 0 for now! + num_prompt_tokens = context.num_prompt_tokens + num_generated_tokens = context.num_output_tokens + num_cached_tokens = context.num_cached_tokens + num_reasoning_tokens = context.num_reasoning_tokens else: - reasoning_content = None - content = final_output.text + assert isinstance(context, SimpleContext) + final_res = context.last_output + assert final_res is not None + assert len(final_res.outputs) == 1 + final_output = final_res.outputs[0] - output = [] - if reasoning_content: - reasoning_item = ResponseReasoningItem( - text=reasoning_content, - status=None, # NOTE: Only the last output item has status. - ) - output.append(reasoning_item) - if content: - output_text = ResponseOutputText( - text=content, - annotations=[], # TODO - type="output_text", - logprobs=None, # TODO - ) - message = ResponseOutputMessage( - id=f"msg_{random_uuid()}", - content=[output_text], - role="assistant", - status="completed", - type="message", - ) - output.append(message) + output = self._make_response_output_items(request, final_output, + tokenizer) - # Calculate usage. - assert final_res.prompt_token_ids is not None - num_prompt_tokens = len(final_res.prompt_token_ids) - num_generated_tokens = len(final_output.token_ids) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, + # Calculate usage. 
+ assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + num_generated_tokens = len(final_output.token_ids) + num_cached_tokens = final_res.num_cached_tokens + num_reasoning_tokens = 0 + + usage = ResponseUsage( + input_tokens=num_prompt_tokens, + output_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, + input_tokens_details=InputTokensDetails( + cached_tokens=num_cached_tokens), + output_tokens_details=OutputTokensDetails( + reasoning_tokens=num_reasoning_tokens), ) - if self.enable_prompt_tokens_details and final_res.num_cached_tokens: - usage.prompt_tokens_details = PromptTokenUsageInfo( - cached_tokens=final_res.num_cached_tokens) - request_metadata.final_usage_info = usage - response = ResponsesResponse.from_request( request, sampling_params, @@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing): self.response_store[response.id] = response return response + def _make_response_output_items( + self, + request: ResponsesRequest, + final_output: CompletionOutput, + tokenizer: AnyTokenizer, + ) -> list[ResponseOutputItem]: + if self.reasoning_parser: + try: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + raise e + + reasoning_content, content = ( + reasoning_parser.extract_reasoning_content(final_output.text, + request=request)) + else: + reasoning_content = None + content = final_output.text + + output = [] + if reasoning_content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=reasoning_content, + type="reasoning_text") + ], + status=None, # NOTE: Only the last output item has status. + ) + output.append(reasoning_item) + if content: + output_text = ResponseOutputText( + text=content, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + message = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="completed", + type="message", + ) + output.append(message) + return output + + def _make_response_output_items_with_harmony( + self, + context: HarmonyContext, + ) -> list[ResponseOutputItem]: + output_items = [] + num_init_messages = context.num_init_messages + for msg in context.messages[num_init_messages:]: + output_items.extend(parse_output_message(msg)) + # Handle the generation stopped in the middle (if any). 
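# parse_remaining_state() recovers any partial reasoning/final message the
# streaming parser was still building when generation stopped early.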
+ last_items = parse_remaining_state(context.parser) + if last_items: + output_items.extend(last_items) + return output_items + def _construct_input_messages( self, request: ResponsesRequest, From 399d2a10e23fcf37cc7a703d7de50ffecc7e0c6f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Aug 2025 08:54:39 -0700 Subject: [PATCH 068/932] Fix pre-commit error in main (#22462) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/openai/serving_responses.py | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index d40231795b..a7554e0d68 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -422,24 +422,6 @@ class OpenAIServingResponses(OpenAIServing): usage=usage, ) - # Log complete response if output logging is enabled - if self.enable_log_outputs and self.request_logger: - output_text = "" - if content: - output_text = content - elif reasoning_content: - output_text = f"[reasoning: {reasoning_content}]" - - if output_text: - self.request_logger.log_outputs( - request_id=request.request_id, - outputs=output_text, - output_token_ids=final_output.token_ids, - finish_reason=final_output.finish_reason, - is_streaming=False, - delta=False, - ) - if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) @@ -469,6 +451,24 @@ class OpenAIServingResponses(OpenAIServing): reasoning_content = None content = final_output.text + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + output_text = "" + if content: + output_text = content + elif reasoning_content: + output_text = f"[reasoning: {reasoning_content}]" + + if output_text: + self.request_logger.log_outputs( + request_id=request.request_id, + outputs=output_text, + output_token_ids=final_output.token_ids, + finish_reason=final_output.finish_reason, + is_streaming=False, + delta=False, + ) + output = [] if reasoning_content: reasoning_item = ResponseReasoningItem( From 8c9da6be229336a769d9c904415daaa250824c89 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 00:47:07 +0800 Subject: [PATCH 069/932] [Core] Simplify mm processing cache (#22457) Signed-off-by: DarkLight1337 --- .../models/qwen2_5_omni_thinker.py | 12 +- vllm/model_executor/models/transformers.py | 5 +- vllm/multimodal/processing.py | 248 +++++------------- vllm/v1/serial_utils.py | 34 +-- 4 files changed, 95 insertions(+), 204 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index b9fed79c84..a3af541d20 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -431,7 +431,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, enable_hf_prompt_update: bool, - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], BatchFeature, bool]: """ Qwen2.5-Omni reimplements this function to handle text only. 
""" @@ -448,20 +448,20 @@ class Qwen2_5OmniThinkerMultiModalProcessor( else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) - mm_kwargs = self._apply_hf_processor_mm_only( + mm_processed_data = self._apply_hf_processor_mm_only( mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, False + return prompt_ids, mm_processed_data, False def _apply_hf_processor_mm_only( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> MultiModalKwargs: + ) -> BatchFeature: """ Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`. """ @@ -473,14 +473,14 @@ class Qwen2_5OmniThinkerMultiModalProcessor( assert "audio" in mm_counts mm_counts["audio"] -= mm_counts["video"] - _, mm_kwargs, _ = self._apply_hf_processor_text_mm( + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return mm_kwargs + return mm_processed_data def _validate_mm_placeholders( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 0c3df267ed..92e132045c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -22,7 +22,8 @@ from typing import Literal, Optional, Union import regex as re import torch from torch import nn -from transformers import AutoModel, PretrainedConfig, PreTrainedModel +from transformers import (AutoModel, BatchFeature, PretrainedConfig, + PreTrainedModel) from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention @@ -269,7 +270,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ): + ) -> tuple[list[int], BatchFeature, bool]: """ Apply the HF processor on the prompt text and multi-modal data together. 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0378539495..38c5d5d99f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -18,7 +18,7 @@ from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby +from vllm.utils import flatten_2d_lists, full_groupby from .cache import MultiModalCache from .hasher import MultiModalHasher @@ -887,120 +887,19 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -class ProcessingCacheOptionalItem(NamedTuple): - key: str - value: Optional[MultiModalKwargsItem] - - -class ProcessingCacheItem(NamedTuple): - key: str - value: MultiModalKwargsItem - - class ProcessingCache(MultiModalCache): - def __init__( - self, - capacity_gb: float, - *, - debug_cache_hit_ratio_steps: Optional[int] = None, - ) -> None: + def __init__(self, capacity_gb: float) -> None: super().__init__() - self.debug_cache_hit_ratio_steps = debug_cache_hit_ratio_steps - self.debug_cache_hits = 0 - self.debug_cache_total = 0 + self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) - self._cache = self.get_lru_cache( - capacity_gb, - MultiModalKwargsItem, - debug=bool(debug_cache_hit_ratio_steps), - ) + self.get = self._cache.get + self.put = self._cache.put + self.reset = self._cache.clear - def _maybe_log_cache_stats(self) -> None: - steps = self.debug_cache_hit_ratio_steps - if not steps: - return - total = self.debug_cache_total - if total > 0 and total % steps == 0: - logger.debug("ProcessingCache: hit_ratio = %.2f", - self.debug_cache_hits / total) - logger.debug("ProcessingCache: size = %.2f / %.2f GiB", - self._cache.currsize / GiB_bytes, - self._cache.maxsize / GiB_bytes) - - def get( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - ) -> Optional[MultiModalKwargsItem]: - """ - Get a processed multi-modal item from the cache - according to its dependencies, including: - - - The model ID - - The modality of the item - - The original data item passed to the HF processor - - The configuration options of the HF processor - """ - self._maybe_log_cache_stats() - - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - - if self.debug_cache_hit_ratio_steps: - if cache_key in self._cache: - self.debug_cache_hits += 1 - - self.debug_cache_total += 1 - - return self._cache.get(cache_key) - - def get_item( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - ) -> ProcessingCacheOptionalItem: - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - - return ProcessingCacheOptionalItem( - key=cache_key, - value=self._cache.get(cache_key), - ) - - def put( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - output_kwargs: MultiModalKwargsItem, - ) -> None: - """ - Put a processed multi-modal item into the cache - according to its dependencies - (see [`get`][vllm.multimodal.processing.ProcessingCache.get]). 
- """ - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - self._cache[cache_key] = output_kwargs - - def put_item(self, item: ProcessingCacheItem) -> None: - self._cache[item.key] = item.value - - def reset(self) -> bool: - self._cache.clear() - - return True +_CacheItemOrHash = Union[MultiModalKwargsItem, str] class BaseProcessingInfo: @@ -1279,7 +1178,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], "BatchFeature", bool]: """ Apply the HF processor on the prompt text and multi-modal data together. @@ -1298,11 +1197,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): prompt_ids, = processed_data.pop("input_ids").tolist() - mm_kwargs = MultiModalKwargs.from_hf_inputs( - processed_data, - self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), - ) - is_update_applied = self._hf_processor_applies_updates( prompt_text=prompt_text, mm_items=mm_items, @@ -1310,11 +1204,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, is_update_applied + return prompt_ids, processed_data, is_update_applied def _apply_hf_processor_text_only( - self, prompt_text: str, - tokenization_kwargs: Mapping[str, object]) -> list[int]: + self, + prompt_text: str, + tokenization_kwargs: Mapping[str, object], + ) -> list[int]: """ Apply the HF processor on the prompt text only. @@ -1353,7 +1249,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> MultiModalKwargs: + ) -> "BatchFeature": """ Apply the HF processor on the multi-modal data only. @@ -1364,14 +1260,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ mm_counts = mm_items.get_all_counts() - _, mm_kwargs, _ = self._apply_hf_processor_text_mm( + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return mm_kwargs + return mm_processed_data def _apply_hf_processor_main( self, @@ -1381,7 +1277,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, enable_hf_prompt_update: bool, - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], "BatchFeature", bool]: """ Apply the HF processor on the prompt text and multi-modal data. 
@@ -1407,52 +1303,46 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) - mm_kwargs = self._apply_hf_processor_mm_only( + mm_processed_data = self._apply_hf_processor_mm_only( mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, False + return prompt_ids, mm_processed_data, False def _get_cache_missing_items( self, cache: ProcessingCache, mm_data_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> tuple[dict[str, list[ProcessingCacheOptionalItem]], dict[ - str, list[object]]]: - model_id = self.info.model_id - - mm_cache_items = { - modality: [ - cache.get_item( - model_id, modality, item, - dict(**hf_processor_mm_kwargs, **tokenization_kwargs)) - for item in items - ] - for modality, items in mm_data_items.items() + mm_hashes: MultiModalHashes, + ) -> tuple[dict[str, list[_CacheItemOrHash]], MultiModalDataItems]: + mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]] = { + modality: [(h if (v := cache.get(h)) is None else v) + for h in hashes] + for modality, hashes in mm_hashes.items() } mm_missing_idxs = { modality: [ - idx for idx, item in enumerate(cache_items) - if item.value is None + idx for idx, item_or_hash in enumerate(items_or_hashes) + if isinstance(item_or_hash, str) ] - for modality, cache_items in mm_cache_items.items() + for modality, items_or_hashes in mm_cache_items_or_hashes.items() } mm_missing_data = { modality: [mm_data_items[modality][idx] for idx in idxs] for modality, idxs in mm_missing_idxs.items() } - return mm_cache_items, mm_missing_data + return mm_cache_items_or_hashes, self._to_mm_items(mm_missing_data) def _hash_mm_items( - self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object]) -> MultiModalHashes: + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1).""" model_id = self.info.model_id @@ -1470,34 +1360,25 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _merge_mm_kwargs( self, cache: ProcessingCache, - mm_cache_items: dict[str, list[ProcessingCacheOptionalItem]], - mm_missing_data: dict[str, list[object]], + mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]], mm_missing_kwargs: MultiModalKwargs, - ) -> dict[str, list[ProcessingCacheItem]]: - mm_missing_next_idx = {modality: 0 for modality in mm_missing_data} + ) -> dict[str, list[MultiModalKwargsItem]]: + mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_items = defaultdict[str, list[ProcessingCacheItem]](list) - for modality, cache_items in mm_cache_items.items(): - for cache_item in cache_items: - if cache_item.value is None: + merged_items = defaultdict[str, list[MultiModalKwargsItem]](list) + for modality, items_or_hashes in mm_cache_items_or_hashes.items(): + for item_or_hash in items_or_hashes: + if isinstance(item_or_hash, str): kw_item = mm_missing_kwargs.get_item( modality, mm_missing_next_idx[modality], ) - cache_item_new = ProcessingCacheItem( - key=cache_item.key, - value=kw_item, - ) - - cache.put_item(cache_item_new) + cache.put(item_or_hash, kw_item) mm_missing_next_idx[modality] += 1 else: - cache_item_new = ProcessingCacheItem( - key=cache_item.key, - value=cache_item.value, - ) + kw_item = 
item_or_hash - merged_items[modality].append(cache_item_new) + merged_items[modality].append(kw_item) return dict(merged_items) @@ -1512,7 +1393,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ( prompt_ids, - mm_kwargs, + mm_processed_data, is_update_applied, ) = self._apply_hf_processor_main( prompt=prompt, @@ -1522,6 +1403,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): enable_hf_prompt_update=True, ) + mm_kwargs = MultiModalKwargs.from_hf_inputs( + mm_processed_data, + self._get_mm_fields_config(mm_processed_data, + hf_processor_mm_kwargs), + ) + mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs) if return_mm_hashes else None) @@ -1553,49 +1440,52 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return_mm_hashes=return_mm_hashes, ) + mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs) ( - mm_cache_items, - mm_missing_data, + mm_cache_items_or_hashes, + mm_missing_data_items, ) = self._get_cache_missing_items( cache=cache, mm_data_items=mm_data_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - tokenization_kwargs=tokenization_kwargs, + mm_hashes=mm_hashes, ) + mm_hashes_to_return = mm_hashes if return_mm_hashes else None + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, # so we can't apply prompt updates until the new multimodal # items are combined with the cached multimodal items ( prompt_ids, - mm_missing_kwargs, + mm_missing_processed_data, is_update_applied, ) = self._apply_hf_processor_main( prompt=prompt, - mm_items=self._to_mm_items(mm_missing_data), + mm_items=mm_missing_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, enable_hf_prompt_update=False, ) + mm_missing_kwargs = MultiModalKwargs.from_hf_inputs( + mm_missing_processed_data, + self._get_mm_fields_config(mm_missing_processed_data, + hf_processor_mm_kwargs), + ) + mm_cache_items_merged = self._merge_mm_kwargs( cache, - mm_cache_items=mm_cache_items, - mm_missing_data=mm_missing_data, + mm_cache_items_or_hashes=mm_cache_items_or_hashes, mm_missing_kwargs=mm_missing_kwargs, ) mm_kwargs = MultiModalKwargs.from_items([ - item.value for cache_items in mm_cache_items_merged.values() + item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) - mm_hashes = { - modality: [item.key for item in cache_items] - for modality, cache_items in mm_cache_items_merged.items() - } if return_mm_hashes else None - - return prompt_ids, mm_kwargs, mm_hashes, is_update_applied + return prompt_ids, mm_kwargs, mm_hashes_to_return, is_update_applied def _bind_and_group_updates( self, diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 809a60c196..9d063f1eda 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -312,25 +312,25 @@ class MsgpackDecoder: return arr.view(torch_dtype).view(shape) def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]: - decoded_items = [] - for item in obj: - elems = [] - for v in item: - v["data"] = self._decode_nested_tensors(v["data"]) - # Reconstruct the field processor using MultiModalFieldConfig - factory_meth_name, *field_args = v["field"] - factory_meth = getattr(MultiModalFieldConfig, - factory_meth_name) + return [self._decode_mm_item(v) for v in obj] - # Special case: decode the union "slices" field of - # MultiModalFlatField - if factory_meth_name == "flat": - field_args[0] = 
self._decode_nested_slices(field_args[0]) + def _decode_mm_item(self, obj: list) -> MultiModalKwargsItem: + return MultiModalKwargsItem.from_elems( + [self._decode_mm_field_elem(v) for v in obj]) - v["field"] = factory_meth(None, *field_args).field - elems.append(MultiModalFieldElem(**v)) - decoded_items.append(MultiModalKwargsItem.from_elems(elems)) - return decoded_items + def _decode_mm_field_elem(self, obj: dict) -> MultiModalFieldElem: + obj["data"] = self._decode_nested_tensors(obj["data"]) + # Reconstruct the field processor using MultiModalFieldConfig + factory_meth_name, *field_args = obj["field"] + factory_meth = getattr(MultiModalFieldConfig, factory_meth_name) + + # Special case: decode the union "slices" field of + # MultiModalFlatField + if factory_meth_name == "flat": + field_args[0] = self._decode_nested_slices(field_args[0]) + + obj["field"] = factory_meth(None, *field_args).field + return MultiModalFieldElem(**obj) def _decode_nested_tensors(self, obj: Any) -> NestedTensors: if isinstance(obj, (int, float)): From 139d155781c187b6d38ac6d84a516c97ff66bb1f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 00:47:10 +0800 Subject: [PATCH 070/932] [Frontend] Use engine argument to control MM cache size (#22441) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 2 +- docs/configuration/optimization.md | 12 ++++-- examples/offline_inference/mistral-small.py | 6 +-- examples/offline_inference/vision_language.py | 4 +- .../multimodal/generation/vlm_utils/core.py | 4 +- .../multimodal/processing/test_llama4.py | 6 +-- tests/models/utils.py | 4 +- vllm/config.py | 43 ++++++++++++++----- vllm/engine/arg_utils.py | 34 ++++++++++++--- vllm/entrypoints/cli/serve.py | 7 ++- vllm/envs.py | 2 +- vllm/multimodal/registry.py | 22 +++++++--- vllm/v1/core/kv_cache_utils.py | 2 +- 13 files changed, 101 insertions(+), 47 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index dcaf1069bf..058eba5fe0 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process) +- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process) - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index bb7342c93f..2eeb8ad25d 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -161,12 +161,18 @@ By default, the multi-modal processor cache is enabled to avoid repeatedly proce the same multi-modal inputs via Hugging Face `AutoProcessor`, which commonly occurs in multi-turn conversations. -You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable +You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB per API process + 4 GiB per engine core process). +If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`. 
-If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: +Examples: ```python +# Use a larger cache llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - disable_mm_preprocessor_cache=True) + mm_processor_cache_gb=8) + +# Disable the cache +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=0) ``` diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 59ec22a1e9..1f6e5ba146 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -68,7 +68,7 @@ def run_simple_demo(args: argparse.Namespace): max_model_len=4096, max_num_seqs=2, tensor_parallel_size=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4, ) prompt = "Describe this image in one sentence." @@ -105,7 +105,7 @@ def run_advanced_demo(args: argparse.Namespace): limit_mm_per_prompt={"image": max_img_per_msg}, max_model_len=max_img_per_msg * max_tokens_per_img, tensor_parallel_size=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4, ) prompt = "Describe the following image." @@ -164,7 +164,7 @@ def parse_args(): ) parser.add_argument( - "--disable-mm-preprocessor-cache", + "--disable-mm-processor-cache", action="store_true", help="If True, disables caching of multi-modal processor.", ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 5dbe001994..1314d33e90 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1563,7 +1563,7 @@ def parse_args(): ) parser.add_argument( - "--disable-mm-preprocessor-cache", + "--disable-mm-processor-cache", action="store_true", help="If True, disables caching of multi-modal processor.", ) @@ -1603,7 +1603,7 @@ def main(args): engine_args = asdict(req_data.engine_args) | { "seed": args.seed, - "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache, + "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, } llm = LLM(**engine_args) diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index f65385150d..a5d6948f06 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -62,9 +62,7 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- vllm_runner_kwargs_: dict[str, Any] = { - "disable_mm_preprocessor_cache": True, - } + vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0} if model_info.tokenizer: vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer if model_info.tokenizer_mode: diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 9ef7af5562..5e14f0f996 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -15,14 +15,14 @@ from ...utils import build_model_context ["meta-llama/Llama-4-Scout-17B-16E-Instruct"]) @pytest.mark.parametrize("mm_processor_kwargs", [{}]) @pytest.mark.parametrize("num_imgs", [1, 5]) -@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False]) +@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4]) @pytest.mark.parametrize("tokenized_prompt", [True, False]) def test_processor_override( image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict, num_imgs: int, - disable_mm_preprocessor_cache: bool, + mm_processor_cache_gb: int, tokenized_prompt: bool, ): """Ensure llama4 processor works properly.""" @@ -30,7 +30,7 @@ def test_processor_override( model_id, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": num_imgs}, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + mm_processor_cache_gb=mm_processor_cache_gb, ) processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) config = processor.info.get_hf_config() diff --git a/tests/models/utils.py b/tests/models/utils.py index 27ce9de469..1e3d51aeec 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -261,7 +261,7 @@ def build_model_context( model_config_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None, - disable_mm_preprocessor_cache: bool = True, + mm_processor_cache_gb: int = 0, ): """Creates an InputContext for a given model. @@ -291,7 +291,7 @@ def build_model_context( seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + mm_processor_cache_gb=mm_processor_cache_gb, hf_overrides=model_info.hf_overrides, **model_config_kwargs, ) diff --git a/vllm/config.py b/vllm/config.py index 44a8d871f0..8dcd429a6b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -443,8 +443,15 @@ class ModelConfig: from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ - disable_mm_preprocessor_cache: bool = False - """If `True`, disable caching of the multi-modal processor.""" + mm_processor_cache_gb: int = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. 
+ + Set to `0` to disable this cache completely (not recommended).""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -881,17 +888,16 @@ class ModelConfig: limit_per_prompt=self.limit_mm_per_prompt, media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, - disable_mm_preprocessor_cache=self. - disable_mm_preprocessor_cache, + mm_processor_cache_gb=self.mm_processor_cache_gb, interleave_mm_strings=self.interleave_mm_strings) return None - def set_disable_mm_preprocessor_cache(self, value: bool) -> None: + def set_mm_processor_cache_gb(self, value: int) -> None: mm_config = self.get_multimodal_config() - self.disable_mm_preprocessor_cache = value - mm_config.disable_mm_preprocessor_cache = value + self.mm_processor_cache_gb = value + mm_config.mm_processor_cache_gb = value def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( @@ -1698,7 +1704,16 @@ class ModelConfig: if mm_config is None: return False - return not mm_config.disable_mm_preprocessor_cache + return mm_config.mm_processor_cache_gb > 0 + + @property + def enable_mm_processor_cache(self) -> bool: + """Whether the multi-modal processor cache should be enabled.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return mm_config.mm_processor_cache_gb > 0 @property def enable_mm_input_cache(self) -> bool: @@ -1707,7 +1722,7 @@ class ModelConfig: if mm_config is None: return False - return not mm_config.disable_mm_preprocessor_cache + return mm_config.mm_processor_cache_gb > 0 def get_mm_input_cache_gb(self) -> int: mm_config = self.multimodal_config @@ -3391,9 +3406,15 @@ class MultiModalConfig: `{"num_crops": 4}`. """ - disable_mm_preprocessor_cache: bool = False + mm_processor_cache_gb: int = 4 """ - If `True`, disable caching of the multi-modal processor. + The size (in GiB) of the multi-modal processor cache, which is used to + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended). 
""" interleave_mm_strings: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a18cd9dde3..d2153dfae3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -358,8 +358,8 @@ class EngineArgs: "media_io_kwargs") mm_processor_kwargs: Optional[Dict[str, Any]] = \ MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = \ - MultiModalConfig.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache: bool = False # DEPRECATED + mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -720,8 +720,11 @@ class EngineArgs: "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]) multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", - **multimodal_kwargs["disable_mm_preprocessor_cache"]) + "--mm-processor-cache-gb", + **multimodal_kwargs["mm_processor_cache_gb"]) + multimodal_group.add_argument("--disable-mm-preprocessor-cache", + type=bool, + deprecated=True) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -886,6 +889,23 @@ class EngineArgs: self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" self.load_format = "runai_streamer" + if self.disable_mm_preprocessor_cache: + logger.warning( + "`--disable-mm-preprocessor-cache` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-processor-cache-gb 0` instead.", ) + + self.mm_processor_cache_gb = 0 + elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: + logger.warning( + "VLLM_MM_INPUT_CACHE_GIB` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-processor-cache-gb %d` instead.", + envs.VLLM_MM_INPUT_CACHE_GIB, + ) + + self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -922,7 +942,7 @@ class EngineArgs: use_async_output_proc=not self.disable_async_output_proc, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, - disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache, + mm_processor_cache_gb=self.mm_processor_cache_gb, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1234,13 +1254,13 @@ class EngineArgs: dp_supports_mm_processor_cache = (self.data_parallel_size == 1 or data_parallel_external_lb) if (not dp_supports_mm_processor_cache - and not model_config.disable_mm_preprocessor_cache): + and model_config.mm_processor_cache_gb > 0): logger.warning( "Multi-modal processor cache is disabled because " "it is not compatible with data parallelism when " "there does not exist a one-to-one correspondance " "between API and engine core processes.") - model_config.set_disable_mm_preprocessor_cache(True) + model_config.set_mm_processor_cache_gb(0) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 02b78f103c..803a3e0046 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -138,13 +138,13 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 - orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + orig_mm_processor_cache_gb = args.mm_processor_cache_gb if num_api_servers > 1: setup_multiprocess_prometheus() # Not 
compatible with API server scale-out - args.disable_mm_preprocessor_cache = True + args.mm_processor_cache_gb = 0 listen_address, sock = setup_server(args) @@ -161,8 +161,7 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and not ( - orig_disable_mm_preprocessor_cache): + if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: logger.warning("Multi-modal processor cache is disabled because " "it is not compatible with `api_server_count > 1`.") diff --git a/vllm/envs.py b/vllm/envs.py index 212eaf015a..8b12a7ee2b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -561,7 +561,7 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - # Cache size (in GiB per process) for multimodal input cache + # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache # Default is 4 GiB per API process + 4 GiB per engine core process "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5f5b620e0c..dca04e9a1e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn -from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, @@ -96,11 +95,22 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_GIB) + self._processor_cache: Optional[ProcessingCache] = None + + def _get_processor_cache(self, model_config: "ModelConfig"): + capacity_gb = model_config.mm_processor_cache_gb + if capacity_gb is None: + return None # Overrides `disable_cache` argument + + if self._processor_cache is None: + self._processor_cache = ProcessingCache(capacity_gb) + + return self._processor_cache def reset_processor_cache(self) -> bool: """Reset the multi-modal processing cache.""" - self._processing_cache.reset() + if self._processor_cache: + self._processor_cache.reset() return True # Success @@ -244,14 +254,14 @@ class MultiModalRegistry: if tokenizer is None and not model_config.skip_tokenizer_init: tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: - mm_config = model_config.get_multimodal_config() - disable_cache = mm_config.disable_mm_preprocessor_cache + disable_cache = not model_config.enable_mm_processor_cache model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) - cache = None if disable_cache else self._processing_cache + cache = None if disable_cache else self._get_processor_cache( + model_config) return factories.build_processor(ctx, cache=cache) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 38b1d9b13f..626aa35a77 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -430,7 +430,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, raise ValueError( "The number of multi-modal positions and hashes must match. This " "is likely because you did not enable MM hashing. 
" - "Please set `disable_mm_preprocessor_cache=False`.") + "Please set `mm_processor_cache_gb > 0`.") # Note that we assume mm_positions is sorted by offset. # We do not need to check all mm inputs if the start token index is out of From 7e3a8dc90670fd312ce1e0d4eba9bf11c571e3ad Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 18:13:04 +0100 Subject: [PATCH 071/932] Remove `from_dict` from `SpeculativeConfig` (#22451) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/spec_decode/test_ngram.py | 13 ++++++------- vllm/config.py | 5 ----- vllm/engine/arg_utils.py | 19 +++---------------- 3 files changed, 9 insertions(+), 28 deletions(-) diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index c844925e6c..b7303e0443 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -47,13 +47,12 @@ def test_ngram_proposer(): model_config = ModelConfig(model="facebook/opt-125m") return NgramProposer( vllm_config=VllmConfig(model_config=model_config, - speculative_config=SpeculativeConfig. - from_dict({ - "prompt_lookup_min": min_n, - "prompt_lookup_max": max_n, - "num_speculative_tokens": k, - "method": "ngram", - }))) + speculative_config=SpeculativeConfig( + prompt_lookup_min=min_n, + prompt_lookup_max=max_n, + num_speculative_tokens=k, + method="ngram", + ))) # No match. result = ngram_proposer( diff --git a/vllm/config.py b/vllm/config.py index 8dcd429a6b..7147702edd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2895,11 +2895,6 @@ class SpeculativeConfig: usedforsecurity=False).hexdigest() return hash_str - @classmethod - def from_dict(cls, dict_value: dict) -> "SpeculativeConfig": - """Parse the CLI value for the speculative config.""" - return cls(**dict_value) - @staticmethod def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: if hf_config.model_type == "deepseek_v3": diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d2153dfae3..c0ac3ff631 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -757,18 +757,6 @@ class EngineArgs: lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"]) - # Speculative arguments - speculative_group = parser.add_argument_group( - title="SpeculativeConfig", - description=SpeculativeConfig.__doc__, - ) - speculative_group.add_argument( - "--speculative-config", - type=json.loads, - default=None, - help="The configurations for speculative decoding. 
Should be a " - "JSON string.") - # Observability arguments observability_kwargs = get_kwargs(ObservabilityConfig) observability_group = parser.add_argument_group( @@ -848,6 +836,8 @@ class EngineArgs: title="VllmConfig", description=VllmConfig.__doc__, ) + vllm_group.add_argument("--speculative-config", + **vllm_kwargs["speculative_config"]) vllm_group.add_argument("--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]) vllm_group.add_argument('--kv-events-config', @@ -1033,10 +1023,7 @@ class EngineArgs: "enable_chunked_prefill": enable_chunked_prefill, "disable_log_stats": disable_log_stats, }) - speculative_config = SpeculativeConfig.from_dict( - self.speculative_config) - - return speculative_config + return SpeculativeConfig(**self.speculative_config) def create_engine_config( self, From acf8aeb79e23c32217dd37b5e96847302ae4d0b7 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Fri, 8 Aug 2025 09:57:27 +0800 Subject: [PATCH 072/932] [Misc] normalize multiprocessing Queue usage (#22371) Signed-off-by: Andy Xie --- tests/test_sharded_state_loader.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 1bb4203d21..42afdfa3c7 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -118,8 +118,17 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, tensor_parallel_size=tp_size, )) p.start() - p.join() + # Call queue.get() before p.join() to prevent deadlock: + # If p.join() is called before queue.get() and the queue is full, + # the child process may block while writing to the queue and never + # terminate, causing the parent to wait indefinitely on p.join(). + # See: https://github.com/vllm-project/vllm/pull/22371#discussion_r2257773814 out_before = queue.get() + p.join() + queue.close() + queue.join_thread() + + queue = ctx.Queue() p = ctx.Process(target=_run_generate, args=(output_dir, queue), @@ -131,7 +140,14 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, load_format="sharded_state", )) p.start() - p.join() + # Call queue.get() before p.join() to prevent deadlock: + # If p.join() is called before queue.get() and the queue is full, + # the child process may block while writing to the queue and never + # terminate, causing the parent to wait indefinitely on p.join(). 
+ # See: https://github.com/vllm-project/vllm/pull/22371#discussion_r2257773814 out_after = queue.get() + p.join() + queue.close() + queue.join_thread() assert out_before == out_after From 1ee5ead5f8f1c3c77b73effcb230ee02952fbe1f Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 7 Aug 2025 19:13:17 -0700 Subject: [PATCH 073/932] [ROCm] [V1] [SpecDec] Enable Speculative Decoding on ROCm V1 Engine (#21496) Signed-off-by: tjtanaa --- tests/utils.py | 16 ++++++++ tests/v1/attention/utils.py | 7 +++- tests/v1/e2e/test_spec_decode.py | 15 ++++++++ tests/v1/spec_decode/test_eagle.py | 55 +++++++++++++++++++++++----- tests/v1/spec_decode/test_max_len.py | 54 +++++++++++++++------------ vllm/v1/spec_decode/eagle.py | 22 ++++++++--- 6 files changed, 128 insertions(+), 41 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 1c1a1cc601..741b4401cc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -986,3 +986,19 @@ def has_module_attribute(module_name, attribute_name): return hasattr(module, attribute_name) except ImportError: return False + + +def get_attn_backend_list_based_on_platform() -> list[str]: + if current_platform.is_cuda(): + return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"] + elif current_platform.is_rocm(): + attn_backend_list = ["TRITON_ATTN_VLLM_V1"] + try: + import aiter # noqa: F401 + attn_backend_list.append("FLASH_ATTN_VLLM_V1") + except Exception: + print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed") + + return attn_backend_list + else: + raise ValueError("Unsupported platform") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e9e574501d..a4e38eb32f 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -11,7 +11,7 @@ import torch from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig, LoadConfig, ModelConfig, ModelDType, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.platforms import _Backend +from vllm.platforms import _Backend, current_platform from vllm.utils import resolve_obj_by_qualname from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec @@ -119,7 +119,10 @@ def get_attention_backend(backend_name: _Backend): """ backend_map = { _Backend.FLASH_ATTN_VLLM_V1: - "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend", + ("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + if current_platform.is_cuda() else + "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" + ), _Backend.FLASHINFER_VLLM_V1: "vllm.v1.attention.backends.flashinfer.FlashInferBackend", _Backend.FLEX_ATTENTION: diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 31f25e94c5..4950faf826 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -8,10 +8,12 @@ from typing import Any, Union import pytest import torch +from tests.utils import get_attn_backend_list_based_on_platform from vllm import LLM, SamplingParams from vllm.assets.base import VLLM_S3_BUCKET_URL from vllm.assets.image import VLM_IMAGES_DIR from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform def get_test_prompts(mm_enabled: bool): @@ -141,11 +143,14 @@ def test_ngram_correctness( marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), ], ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) def 
test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], mm_enabled: bool, + attn_backend: str, ): # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) @@ -156,6 +161,16 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + method, model_name, spec_model_name, tp_size = model_setup ref_llm = LLM(model=model_name, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 73b47f8974..2b4f8bd2a8 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -6,6 +6,7 @@ from unittest import mock import pytest import torch +from tests.utils import get_attn_backend_list_based_on_platform from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata, create_standard_kv_cache_spec, @@ -120,17 +121,28 @@ def test_prepare_inputs(): assert torch.equal(token_indices, expected_token_indices) -@pytest.mark.parametrize("method,proposer_helper", [ - ("eagle", lambda k: _create_proposer("eagle", k)), - ("eagle3", lambda k: _create_proposer("eagle3", k)), -]) +@pytest.mark.parametrize("method", ["eagle", "eagle3"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') @mock.patch('vllm.v1.spec_decode.eagle.get_model') def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, - proposer_helper, pp_size, use_distinct_embed_tokens): + attn_backend, pp_size, use_distinct_embed_tokens, + monkeypatch): + + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Setup draft model mock mock_model = mock.MagicMock() if use_distinct_embed_tokens: @@ -177,7 +189,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.lm_head = mock.MagicMock() # Create proposer using the helper function - proposer = proposer_helper(k=8) + proposer = _create_proposer(method, k=8) # Call the method under test proposer.load_model(target_model) @@ -201,10 +213,22 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.model.embed_tokens +@pytest.mark.parametrize("method", ["eagle", "eagle3"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) -@pytest.mark.parametrize("backend", - [_Backend.FLASH_ATTN_VLLM_V1, _Backend.TREE_ATTN]) -def test_propose(num_speculative_tokens, backend): +def test_propose(method, attn_backend, 
num_speculative_tokens, monkeypatch): + + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Use GPU device device = torch.device(current_platform.device_type) @@ -303,7 +327,18 @@ def test_propose(num_speculative_tokens, backend): device=device) sampling_metadata = mock.MagicMock() - attn_metadata_builder_cls, _ = get_attention_backend(backend) + if attn_backend == "FLASH_ATTN_VLLM_V1": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.FLASH_ATTN_VLLM_V1) + elif attn_backend == "TRITON_ATTN_VLLM_V1": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.TRITON_ATTN_VLLM_V1) + elif attn_backend == "TREE_ATTN": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.TREE_ATTN) + else: + raise ValueError(f"Unsupported attention backend: {attn_backend}") + attn_metadata_builder = attn_metadata_builder_cls( kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), layer_names=proposer.attn_layer_names, diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 9070d2b10f..fef6a5421b 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -4,7 +4,9 @@ import pytest +from tests.utils import get_attn_backend_list_based_on_platform from vllm import LLM, SamplingParams +from vllm.platforms import current_platform _PROMPTS = [ "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", @@ -14,36 +16,40 @@ _PROMPTS = [ @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 10]) -def test_ngram_max_len( - monkeypatch: pytest.MonkeyPatch, - num_speculative_tokens: int, -): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - llm = LLM( - model="facebook/opt-125m", - max_model_len=100, - enforce_eager=True, # For faster initialization. - speculative_config={ - "method": "ngram", - "prompt_lookup_max": 5, - "prompt_lookup_min": 3, - "num_speculative_tokens": num_speculative_tokens, - }, - ) - sampling_params = SamplingParams(max_tokens=100, ignore_eos=True) - llm.generate(_PROMPTS, sampling_params) +def test_ngram_max_len(num_speculative_tokens: int): + llm = LLM( + model="facebook/opt-125m", + max_model_len=100, + enforce_eager=True, # For faster initialization. 
+ speculative_config={ + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": num_speculative_tokens, + }, + ) + sampling_params = SamplingParams(max_tokens=100, ignore_eos=True) + llm.generate(_PROMPTS, sampling_params) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 10]) -def test_eagle_max_len( - monkeypatch: pytest.MonkeyPatch, - num_speculative_tokens: int, -): +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) +def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, + num_speculative_tokens: int, attn_backend: str): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + llm = LLM( model="meta-llama/Meta-Llama-3-8B-Instruct", enforce_eager=True, # For faster initialization. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3c36971fe5..f75d76dd97 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -17,10 +17,14 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) +from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata @@ -230,11 +234,19 @@ class EagleProposer: # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. - # Currently, only FlashAttention and TreeAttention support multi-token - # eagle spec decode. This is because the code below - # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, - (FlashAttentionMetadata, TreeAttentionMetadata)) + # On ROCm, both AiterFlashAttention and TritonAttention + # support multi-token eagle spec decode. + if current_platform.is_rocm(): + assert isinstance( + attn_metadata, + (TritonAttentionMetadata, AiterFlashAttentionMetadata, + FlashAttentionMetadata)) + else: + # Currently, only FlashAttention and TreeAttention support + # multi-token eagle spec decode. This is because the code below + # makes assumptions about attn_metadata attributes available. + assert isinstance(attn_metadata, + (FlashAttentionMetadata, TreeAttentionMetadata)) # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] From e2c8f1edec24f7a89a68e3b48bc65ae683aed0cb Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Thu, 7 Aug 2025 21:15:32 -0500 Subject: [PATCH 074/932] [PERF] Use pybase64 to more quickly decode prompt embeddings (#22469) Signed-off-by: Andrew Sansom --- vllm/entrypoints/openai/serving_engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index efd2f20299..fb9d456df7 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import base64 import io import json import sys @@ -12,6 +11,7 @@ from http import HTTPStatus from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional, TypeVar, Union, cast, overload) +import pybase64 import torch from fastapi import Request from pydantic import BaseModel, ConfigDict, Field @@ -1008,7 +1008,8 @@ class OpenAIServing: ) -> list[EmbedsPrompt]: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: - tensor = torch.load(io.BytesIO(base64.b64decode(embed)), + tensor = torch.load(io.BytesIO( + pybase64.b64decode(embed, validate=True)), weights_only=True) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, From d57dc2364e88e9a1a3e8dc3f6ff8486a7ba040dd Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Thu, 7 Aug 2025 19:18:19 -0700 Subject: [PATCH 075/932] Add ModelOpt Qwen3 nvfp4 support (#20101) Signed-off-by: Zhiyu Cheng --- .../model_loader/weight_utils.py | 66 ++++++++++--------- vllm/model_executor/models/qwen2.py | 13 +++- vllm/model_executor/models/qwen3_moe.py | 16 ++++- 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 074126fa66..78b186265d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -764,39 +764,41 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: return None return remapped_name - possible_scale_names = [".k_scale", ".v_scale"] - modelopt_scale_names = [ - ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" + # Define scale name mapping patterns in order of precedence + scale_mapping_patterns = [ + # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.([kv])_proj\.([kv])_scale$", + r".self_attn.attn.\2_scale"), + # QKV proj format: .self_attn.qkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"), + # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale" + ), + # Default format: .{k,v}_scale -> .attn.{k,v}_scale + (r"\.([kv])_scale$", r".attn.\1_scale"), ] - # Also support qkv_proj scale parameters (from stacked parameter processing) - qkv_proj_scale_names = [ - ".self_attn.qkv_proj.k_scale", ".self_attn.qkv_proj.v_scale" - ] - for scale_name in possible_scale_names: - if name.endswith(scale_name): - if any(mo_scale_name in name - for mo_scale_name in modelopt_scale_names): - remapped_name = name.replace( - f".self_attn.{scale_name[1]}_proj{scale_name}", - f".self_attn.attn{scale_name}") - elif any(qkv_scale_name in name 
- for qkv_scale_name in qkv_proj_scale_names): - # Handle qkv_proj scale parameters - remapped_name = name.replace( - f".self_attn.qkv_proj{scale_name}", - f".self_attn.attn{scale_name}") - else: - remapped_name = name.replace(scale_name, f".attn{scale_name}") - if remapped_name not in params_dict: - logger.warning_once( - "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501 - scale_name, - name, - remapped_name, - scale_name, - ) - return None - return remapped_name + + # Check if name ends with k_scale or v_scale + if name.endswith((".k_scale", ".v_scale")): + import regex as re + + for pattern, replacement in scale_mapping_patterns: + if re.search(pattern, name): + remapped_name = re.sub(pattern, replacement, name) + if remapped_name not in params_dict: + scale_type = name.split(".")[-1] + logger.warning_once( + "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501 + scale_type, + name, + remapped_name, + scale_type, + ) + return None + return remapped_name # If there were no matches, return the untouched param name return name diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0e7507a457..e4f0de04e9 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -408,9 +408,18 @@ class Qwen2Model(nn.Module): continue if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 7410589190..b2397c115d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -48,7 +48,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -471,12 +472,21 @@ class Qwen3MoeModel(nn.Module): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue if name not in params_dict: continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: is_expert_weight = False From a3b9c17b56d09a091e210222d8e1f75cabe65b84 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Aug 2025 21:18:22 -0500 Subject: [PATCH 076/932] Support Tensorrt-LLM MoE fp4 for low-latency (#21331) Signed-off-by: Shu Wang Signed-off-by: Po-Han Huang Signed-off-by: Shu Wang. Signed-off-by: XIn Li Co-authored-by: XIn Li --- vllm/envs.py | 15 + .../model_executor/layers/fused_moe/config.py | 3 +- .../compressed_tensors_moe.py | 9 +- .../layers/quantization/modelopt.py | 284 ++++++++++++++++-- .../quantization/utils/flashinfer_fp4_moe.py | 12 +- .../quantization/utils/nvfp4_moe_support.py | 4 +- vllm/utils/flashinfer.py | 4 + 7 files changed, 288 insertions(+), 43 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8b12a7ee2b..f81f6dacd8 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -129,6 +129,7 @@ if TYPE_CHECKING: VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False + VLLM_FLASHINFER_MOE_BACKEND: str = "throughput" VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False @@ -982,6 +983,20 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), + # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both + # require compute capability 10.0 or above. + # Available options: + # - "throughput": [default] + # Uses CUTLASS kernels optimized for high-throughput batch inference. + # - "latency": + # Uses TensorRT-LLM kernels optimized for low-latency inference. + # To set this backend, define the environment variable: + # export VLLM_FLASHINFER_MOE_BACKEND=latency. + # If not set, defaults to "throughput". + "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv( + "VLLM_FLASHINFER_MOE_BACKEND", "throughput" + ), + # Control the maximum number of tokens per expert supported by the # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for # the blockscale tensor of activations NVFP4 Quantization. 
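For illustration, the new `VLLM_FLASHINFER_MOE_BACKEND` environment variable is resolved into a kernel choice roughly as follows (a condensed, standalone sketch of the selection logic that `ModelOptNvFp4FusedMoE.__init__` applies in the next file; it reads the variable directly instead of going through `vllm.envs`):

```python
import os
from enum import Enum


class FlashinferMoeBackend(Enum):
    TENSORRT_LLM = "TensorRT-LLM"  # low-latency kernels
    CUTLASS = "CUTLASS"            # high-throughput kernels


def resolve_flashinfer_moe_backend() -> FlashinferMoeBackend:
    backend = os.getenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    if backend == "throughput":
        return FlashinferMoeBackend.CUTLASS
    if backend == "latency":
        return FlashinferMoeBackend.TENSORRT_LLM
    raise ValueError(f"Unknown flashinfer moe backend: {backend}, "
                     "expected one of ['throughput', 'latency']")


print(resolve_flashinfer_moe_backend())  # CUTLASS unless the variable is set
```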
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 9e4ee5a3d7..f2242ade0c 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -192,7 +192,8 @@ class FusedMoEParallelConfig: @property def use_flashinfer_cutlass_kernels(self): return (envs.VLLM_USE_FLASHINFER_MOE_FP4 - and has_flashinfer_cutlass_fused_moe()) + and has_flashinfer_cutlass_fused_moe() + and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput") @staticmethod def make(tp_size_: int, dp_size_: int, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 09d8890888..c04f7c39a5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -105,7 +105,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): detect_nvfp4_moe_support) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 self.fused_experts = None # type: ignore[assignment] @@ -212,7 +212,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): requires_grad=False) # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. - if self.allow_flashinfer_cutlass: + if self.allow_flashinfer: w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2) @@ -266,7 +266,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): (layer.w2_input_global_scale), requires_grad=False) def maybe_swap_experts_impl(self, moe_parallel_config): - if not self.allow_flashinfer_cutlass: + if not self.allow_flashinfer: return self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( moe_parallel_config) @@ -277,8 +277,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 select_nvfp4_gemm_impl) - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) + return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) def apply( self, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 0334a28245..147b275eaf 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum from typing import Any, Callable, Optional, Union import torch @@ -36,6 +37,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) from vllm.scalar_type import scalar_types +from vllm.utils import next_power_of_2 from vllm.utils.flashinfer import has_flashinfer_moe logger = init_logger(__name__) @@ -44,6 +46,11 @@ QUANT_ALGOS = ["FP8", "NVFP4"] KV_CACHE_QUANT_ALGOS = ["FP8"] +class FlashinferMoeBackend(Enum): + TENSORRT_LLM = "TensorRT-LLM" + CUTLASS = "CUTLASS" + + class ModelOptFp8Config(QuantizationConfig): """Config class 
for ModelOpt FP8.""" @@ -185,7 +192,7 @@ class ModelOptFp8LinearMethod(LinearMethodBase): Args: quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config): + def __init__(self, quant_config: ModelOptFp8Config) -> None: self.quant_config = quant_config self.fp8_linear = Fp8LinearOp( act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR) @@ -265,7 +272,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config): + def __init__(self, quant_config: ModelOptFp8Config) -> None: self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported) @@ -670,7 +677,8 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, exclude_modules: list): + def is_layer_excluded(self, prefix: str, + exclude_modules: list[str]) -> bool: import regex as re for pattern in exclude_modules: regex_str = pattern.replace('.', r'\.').replace('*', r'.*') @@ -714,7 +722,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): Args: quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptNvFp4Config): + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.quant_config = quant_config self.cutlass_nvfp4_supported = cutlass_fp4_supported() self.use_marlin = False @@ -859,6 +867,16 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): return out.view(*output_shape) +def _get_tile_tokens_dim(num_tokens: int, top_k: int, num_experts: int) -> int: + # Guess tokens per expert assuming perfect expert distribution first. + num_tokens_per_expert = (num_tokens * top_k) // num_experts + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. + tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim + + class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): """ MoE Method for FP4 Quantization. 
@@ -866,22 +884,40 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): quant_config: NVFP4 Quant Config """ - def __init__(self, quant_config: ModelOptNvFp4Config): + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin + self.flashinfer_moe_backend = None - self.fused_experts = None # type: ignore + if self.allow_flashinfer: + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + if flashinfer_moe_backend == "throughput": + self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + logger.info_once("Using FlashInfer CUTLASS kernels for " + "ModelOptNvFp4FusedMoE.") + elif flashinfer_moe_backend == "latency": + self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM + logger.info_once("Using FlashInfer TensorRT-LLM kernels for " + "ModelOptNvFp4FusedMoE.") + else: + allowed_backends = ["throughput", "latency"] + raise ValueError( + f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" + f" expected one of {allowed_backends}") + + self.fused_experts: Optional[ + mk.FusedMoEModularKernel] = None # type: ignore[assignment] def maybe_swap_experts_impl( self, moe_parallel_config: FusedMoEParallelConfig, ): - if not self.allow_flashinfer_cutlass: + if not self.allow_flashinfer: return self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( moe_parallel_config) @@ -897,8 +933,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 select_nvfp4_gemm_impl) - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) + return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) def uses_weight_scale_2_pattern(self) -> bool: """ @@ -996,14 +1031,101 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): weight_loader=weight_loader) layer.register_parameter("w2_input_scale", w2_input_scale) + def prepare_static_weight_layouts_for_trtllm_moe( + self, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + gemm1_scales_linear_fp4_bytes: torch.Tensor, + gemm2_scales_linear_fp4_bytes: torch.Tensor, + hidden_size: int, + intermediate_size: int, + num_experts: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Prepare quantized weights for kernel (done offline with weights).""" + from flashinfer import (reorder_rows_for_gated_act_gemm, + shuffle_matrix_a, shuffle_matrix_sf_a) + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + + # Convert quantized weights to proper formats + gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape( + num_experts, 2 * intermediate_size, hidden_size // 2) # packed fp4 + gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn).reshape(num_experts, 2 * intermediate_size, + hidden_size // + 16) # fp8 scaling factors + + gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape( + num_experts, hidden_size, intermediate_size // 2) # packed fp4 + gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn).reshape(num_experts, hidden_size, + intermediate_size // + 16) # fp8 scaling factors + + # 
Reorder rows of W1 and scales for fused gated activation + gemm1_weights_fp4_interleaved = [] + gemm1_scales_fp4_interleaved = [] + for i in range(num_experts): + gemm1_weights_fp4_interleaved.append( + reorder_rows_for_gated_act_gemm(gemm1_weights_fp4[i].clone())) + gemm1_scales_fp4_interleaved.append( + reorder_rows_for_gated_act_gemm( + gemm1_scales_linear_fp4[i].clone())) + + # Stack weights and scales for all experts + gemm1_weights_fp4_interleaved = torch.stack( + gemm1_weights_fp4_interleaved).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 2) + gemm1_scales_fp4_interleaved = torch.stack( + gemm1_scales_fp4_interleaved).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 16) + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_fp4_shuffled = [] + gemm1_scales_fp4_shuffled = [] + gemm2_weights_fp4_shuffled = [] + gemm2_scales_fp4_shuffled = [] + for i in range(num_experts): + gemm1_weights_fp4_shuffled.append( + shuffle_matrix_a( + gemm1_weights_fp4_interleaved[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_scales_fp4_shuffled.append( + shuffle_matrix_sf_a( + gemm1_scales_fp4_interleaved[i].view(torch.uint8), + epilogue_tile_m)) + + gemm2_weights_fp4_shuffled.append( + shuffle_matrix_a(gemm2_weights_fp4[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_scales_fp4_shuffled.append( + shuffle_matrix_sf_a( + gemm2_scales_linear_fp4[i].view(torch.uint8), + epilogue_tile_m)) + + # Stack weights for all experts + gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled) + gemm1_scales_fp4_shuffled = ( + torch.stack(gemm1_scales_fp4_shuffled).view( + torch.float8_e4m3fn).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 16)) + + gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled) + gemm2_scales_fp4_shuffled = ( + torch.stack(gemm2_scales_fp4_shuffled).view( + torch.float8_e4m3fn).reshape(num_experts, hidden_size, + intermediate_size // 16)) + return (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # GEMM 1 - # The FlashInfer Cutlass fused MoE kernel expects the combined weights - # to be ordered as [w3, w1], unlike the standard [w1, w3] layout. 
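+        # The FlashInfer CUTLASS fused-MoE kernel expects the combined w13
+        # weights (and their block scales) to be ordered as [w3, w1], unlike
+        # the standard [w1, w3] layout; reorder_w1w3_to_w3w1 below performs
+        # that swap whenever FlashInfer is allowed.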
+ # GEMM 1 processing gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if self.allow_flashinfer_cutlass: + if self.allow_flashinfer: gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2) @@ -1011,6 +1133,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w13_weight_scale = Parameter(gemm1_weight_scale, requires_grad=False) + # Common processing for w13_weight_scale_2 if not torch.allclose(layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]): logger.warning_once( @@ -1021,26 +1144,18 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False) + # Common processing for input scales and alphas w13_input_scale = layer.w13_input_scale.max(dim=1).values.to( torch.float32) layer.g1_alphas = Parameter( (w13_input_scale * w13_weight_scale_2).to(torch.float32), requires_grad=False) - assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( - "Expected weight_scale.dim(1) to be divisible by 16") - assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), ( - "Weight Blockscale must be represented as FP8-E4M3") - w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale) - - layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, - requires_grad=False) - # This is for quantization, so we need to invert it. layer.w13_input_scale_quant = Parameter( (1 / w13_input_scale).to(torch.float32), requires_grad=False) - # GEMM 2 + # GEMM 2 processing layer.g2_alphas = Parameter( (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32), requires_grad=False) @@ -1049,15 +1164,63 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w2_input_scale_quant = Parameter( (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False) - assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( - "Expected weight_scale.dim(1) to be divisible by 16") - assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( - "Weight Blockscale must be represented as FP8-E4M3") - w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) + # TensorRT-LLM specific processing + if self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + # Prepare static weights for TRT-LLM kernel + (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled + ) = self.prepare_static_weight_layouts_for_trtllm_moe( + layer.w13_weight, + layer.w2_weight, + layer.w13_weight_scale, + layer.w2_weight_scale, + layer.w2_weight.size(-2), # hidden_size + layer.w13_weight.size(-2) // 2, # intermediate_size + layer.w13_weight.size(0), # num_experts + ) - layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, - requires_grad=False) - layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + layer.gemm1_weights_fp4_shuffled = Parameter( + gemm1_weights_fp4_shuffled, requires_grad=False) + layer.gemm2_weights_fp4_shuffled = Parameter( + gemm2_weights_fp4_shuffled, requires_grad=False) + layer.gemm1_scales_fp4_shuffled = Parameter( + gemm1_scales_fp4_shuffled, requires_grad=False) + layer.gemm2_scales_fp4_shuffled = Parameter( + gemm2_scales_fp4_shuffled, requires_grad=False) + + # Additional parameter needed for TRT-LLM + layer.g1_scale_c = Parameter( + (layer.w2_input_scale_quant * layer.g1_alphas).to( + torch.float32), + requires_grad=False, + ) + + # Clean up weights that won't be used by TRT-LLM + del layer.w2_weight + del 
layer.w2_weight_scale + del layer.w13_weight + del layer.w13_weight_scale + else: + # Non-TRT-LLM processing (Cutlass or non-flashinfer) + assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( + "Expected weight_scale.dim(1) to be divisible by 16") + assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), ( + "Weight Blockscale must be represented as FP8-E4M3") + w13_blockscale_swizzled = swizzle_blockscale( + layer.w13_weight_scale) + layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, + requires_grad=False) + + assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( + "Expected weight_scale.dim(1) to be divisible by 16") + assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( + "Weight Blockscale must be represented as FP8-E4M3") + w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) + layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, + requires_grad=False) + layer.w2_weight = Parameter(layer.w2_weight.data, + requires_grad=False) if self.use_marlin: prepare_moe_fp4_layer_for_marlin(layer) @@ -1095,6 +1258,60 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.") assert activation == "silu", "Only SiLU activation is supported." + if self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + import flashinfer + + from vllm.model_executor.models.llama4 import Llama4MoE + + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, + hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + use_llama4_routing = \ + custom_routing_function is Llama4MoE.custom_routing_function + routing_method_type = flashinfer.RoutingMethodType.DeepSeekV3 + if use_llama4_routing: + routing_method_type = flashinfer.RoutingMethodType.Llama4 + out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( + routing_logits=router_logits + if use_llama4_routing else router_logits.to(torch.float32), + routing_bias=e_score_correction_bias, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k, + layer.local_num_experts), + routing_method_type=routing_method_type, + do_finalize=True, + )[0] + return out + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -1149,6 +1366,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: + assert self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS out = 
flashinfer_fp4_cutlass_moe_forward( self.fused_experts, layer, @@ -1160,4 +1379,5 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, ) + return out diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4c617e2260..8ef91eeed4 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -126,7 +126,7 @@ def flashinfer_fp4_cutlass_moe_forward( def select_nvfp4_gemm_impl( - allow_flashinfer_cutlass: bool, + allow_flashinfer: bool, moe, # FusedMoEConfig logger): """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" @@ -137,8 +137,14 @@ def select_nvfp4_gemm_impl( all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None - if allow_flashinfer_cutlass: - logger.debug_once("Using FlashInferExperts") + if allow_flashinfer: + flashinfer_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + if flashinfer_backend != "throughput": + raise ValueError( + f"Only throughput backend is supported for FlashInferExperts, " + f"but got {flashinfer_backend}.") + logger.debug_once( + "Initializing FlashInferExperts with throughput backend.") return FlashInferExperts( use_nvfp4_w4a4=True, use_dp=moe.moe_parallel_config.dp_size > 1, diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py index 23a749467f..21af74c6b7 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -21,7 +21,7 @@ class NvFp4Support: """Result container for NV-FP4 capability probing.""" cutlass_supported: bool - allow_flashinfer_cutlass: bool + allow_flashinfer: bool use_marlin: bool @@ -54,6 +54,6 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: return NvFp4Support( cutlass_supported=cutlass_supported, - allow_flashinfer_cutlass=allow_flashinfer, + allow_flashinfer=allow_flashinfer, use_marlin=use_marlin, ) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 32c52612ca..5998d4c312 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -86,6 +86,8 @@ flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe", fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") nvfp4_block_scale_interleave = _lazy_import_wrapper( "flashinfer", "nvfp4_block_scale_interleave") +trtllm_fp4_block_scale_moe = _lazy_import_wrapper( + "flashinfer", "trtllm_fp4_block_scale_moe") # Special case for autotune since it returns a context manager autotune = _lazy_import_wrapper( @@ -112,6 +114,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: ("flashinfer.fused_moe", "cutlass_fused_moe"), ("flashinfer", "fp4_quantize"), ("flashinfer", "nvfp4_block_scale_interleave"), + ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"), ] for module_name, attr_name in required_functions: @@ -188,6 +191,7 @@ __all__ = [ "flashinfer_cutlass_fused_moe", "fp4_quantize", "nvfp4_block_scale_interleave", + "trtllm_fp4_block_scale_moe", "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", From b2c8ce57c68db0764a49d66f048b8a7a5cef9d13 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Aug 2025 21:18:25 -0500 Subject: [PATCH 077/932] Fix Flashinfer CUTLASS MOE 
Allgather (#21963) Signed-off-by: Shu Wang --- .../device_communicators/cuda_communicator.py | 3 +- vllm/forward_context.py | 58 +++++++++++++++++++ .../flashinfer_cutlass_prepare_finalize.py | 24 ++------ vllm/model_executor/layers/fused_moe/layer.py | 13 +++-- 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 4ab8f3d938..66d4940c9c 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -236,7 +236,8 @@ class CudaCommunicator(DeviceCommunicatorBase): input_size = input_.size() if sizes is not None: assert len(sizes) == world_size - assert input_.shape[dim] == sizes[self.rank_in_group] + assert input_.shape[dim] == sizes[self.rank_in_group], ( + f"{input_.shape[dim]} != {sizes[self.rank_in_group]}") output_size = (sum(sizes), ) + input_size[1:] else: output_size = (input_size[0] * world_size, ) + input_size[1:] diff --git a/vllm/forward_context.py b/vllm/forward_context.py index dd55b19fee..4686ba24e6 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -26,10 +26,26 @@ batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL batchsize_forward_time: defaultdict = defaultdict(list) +def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int], + max_num_tokens: int, + chunk_idx: int) -> list[int]: + dp_size = len(num_tokens_across_dp_cpu) + + local_size = [-1] * dp_size + for i in range(dp_size): + dp_tokens = num_tokens_across_dp_cpu[i] + local_size[i] = min(max_num_tokens, + dp_tokens - (max_num_tokens * chunk_idx)) + if local_size[i] <= 0: + local_size[i] = 1 # ensure lockstep even if done + return local_size + + @dataclass class DPMetadata: max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor + local_sizes: Optional[list[int]] = None @staticmethod def num_tokens_across_dp(num_tokens: int, dp_size: int, @@ -78,6 +94,48 @@ class DPMetadata: cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0) return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) + @contextmanager + def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int): + """ + Context manager to compute and temporarily set the per-rank local token + sizes for a specific chunk during chunked forward execution. + + This is necessary to ensure each DP (data parallel) rank processes its + designated portion of tokens in lockstep with others, even when the + token counts are uneven or some ranks have completed their input early. + + For chunked execution, we break up the total tokens on each rank into + multiple chunks (of at most `max_chunk_size_per_rank`), and for a given + `chunk_idx`, this context manager sets `self.local_sizes` to the number + of tokens to process in that chunk on each rank. + + It uses cumulative sizes (`cu_tokens_across_dp_cpu`) to derive the + number of tokens per rank, and calls `_compute_chunked_local_num_tokens` + to determine the chunk-wise split. + + `self.local_sizes` is only valid inside the context. + + Args: + max_chunk_size_per_rank: The max number of tokens each rank is + allowed to process in this chunk. + chunk_idx: The index of the chunk to compute sizes for. 
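+
+        Example (illustrative numbers only): with
+        cu_tokens_across_dp_cpu = [8, 20, 26] (i.e. per-rank token counts
+        [8, 12, 6]), max_chunk_size_per_rank = 8 and chunk_idx = 1, the
+        computed local sizes are [1, 4, 1]; ranks whose tokens are already
+        exhausted still report 1 so that every rank stays in lockstep.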
+ """ + cu_sizes = self.cu_tokens_across_dp_cpu + num_tokens_across_dp_cpu = [ + (cu_sizes[i] - + cu_sizes[i - 1]).item() if i > 0 else cu_sizes[0].item() + for i in range(len(cu_sizes)) + ] + self.local_sizes = _compute_chunked_local_num_tokens( + num_tokens_across_dp_cpu, max_chunk_size_per_rank, chunk_idx) + try: + yield self.local_sizes + finally: + self.local_sizes = None + + def get_chunk_sizes_across_dp_rank(self) -> Optional[list[int]]: + return self.local_sizes + @dataclass class ForwardContext: diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 02e1d1f1fd..7fdb465c45 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -4,7 +4,6 @@ from typing import Any, Optional import torch -import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context @@ -14,20 +13,8 @@ from vllm.model_executor.layers.fused_moe.utils import ( from vllm.utils.flashinfer import nvfp4_block_scale_interleave -def get_local_sizes(local_tokens): - cu_sizes = get_forward_context().dp_metadata.cu_tokens_across_dp_cpu - sizes = [cu_sizes[0].item()] - for i in range(1, len(cu_sizes)): - sizes.append((cu_sizes[i] - cu_sizes[i - 1]).item()) - max_num_tokens = envs.VLLM_MOE_DP_CHUNK_SIZE - sizes_chunked = [max_num_tokens] * len(sizes) - if local_tokens < max_num_tokens: - # When the number of local tokens is less than max_num_tokens, all other - # ranks will also have fewer than max_num_tokens. The remaining tokens - # are accounted for as residual. 
- sizes_chunked = [x % max_num_tokens for x in sizes] - - return sizes_chunked +def get_local_sizes(): + return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank() class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): @@ -90,7 +77,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_weights, topk_ids, a1q, a1q_scale = \ get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, a1q_scale], # noqa: E501 dim=0, - sizes=get_local_sizes(local_tokens)) + sizes=get_local_sizes()) a1_m, a1_n = a1q.shape a1q_scale = nvfp4_block_scale_interleave(a1q_scale) @@ -107,8 +94,5 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ['use_dp', 'local_tokens']) if use_dp: fused_expert_output = get_dp_group().reduce_scatterv( - fused_expert_output, - dim=0, - sizes=get_local_sizes(local_tokens), - ) + fused_expert_output, dim=0, sizes=get_local_sizes()) output.copy_(fused_expert_output) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 76cedb3ed3..272b6ce672 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1570,18 +1570,19 @@ class FusedMoE(torch.nn.Module): max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens num_tokens = full_hidden_states.size(0) - for chunk_start_ in range(0, max_tokens_across_dp, - moe_dp_chunk_size_per_rank): + for chunk_idx, chunk_start_ in enumerate( + range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank)): chunk_start = chunk_start_ chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dp) # clamp start and end chunk_start = min(chunk_start, num_tokens - 1) chunk_end = min(chunk_end, num_tokens) - - process_chunk(chunk_start, - chunk_end, - skip_result_store=chunk_start_ >= num_tokens) + with ctx.dp_metadata.chunked_sizes(moe_dp_chunk_size_per_rank, + chunk_idx): + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= num_tokens) return full_final_hidden_states From 3303f134e03f7a80b42e50065976be9d499c8683 Mon Sep 17 00:00:00 2001 From: Junhao Li Date: Thu, 7 Aug 2025 22:18:28 -0400 Subject: [PATCH 078/932] [Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO 6000) (#22131) Signed-off-by: Junhao Li --- CMakeLists.txt | 1 + csrc/cutlass_extensions/common.hpp | 10 + .../c3x/scaled_mm_blockwise_sm120_fp8.cu | 23 +++ ...scaled_mm_blockwise_sm120_fp8_dispatch.cuh | 183 ++++++++++++++++++ .../cutlass_w8a8/c3x/scaled_mm_kernels.hpp | 6 + .../cutlass_w8a8/scaled_mm_c3x_sm120.cu | 24 +-- 6 files changed, 229 insertions(+), 18 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index e2cc0ccdef..093330caa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,6 +427,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 195872e8ed..f2c1dcf69f 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -60,3 
+60,13 @@ struct enable_sm100_only : Kernel { #endif } }; + +template +struct enable_sm120_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu new file mode 100644 index 0000000000..5515374a57 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu @@ -0,0 +1,23 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm120_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm120_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm120_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh new file mode 100644 index 0000000000..d50a83ae1c --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -0,0 +1,183 @@ +#pragma once + +#include "cuda_utils.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +// clang-format off +template +struct cutlass_3x_gemm_fp8_blockwise { + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + // ColumnMajor is used for B to match the CUTLASS convention. 
+ using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; // TODO: support bias + using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementCompute = float; + using ElementBlockScale = float; + + using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>; + + // layout_SFA and layout_SFB cannot be swapped since they are deduced. + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + + using ArchTag = cutlass::arch::Sm120; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementScalar = float; + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + MmaTileShape, + ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, + ElementCompute, + ElementC, + LayoutC, + AlignmentC, + ElementD, + LayoutD, + AlignmentD, + EpilogueScheduler, + DefaultOperation + >::CollectiveOp; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp; + + using KernelType = enable_sm120_only, CollectiveMainloop, CollectiveEpilogue>>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StrideC = typename Gemm::GemmKernel::StrideC; + using LayoutSFA = typename Gemm::LayoutSFA; + using LayoutSFB = typename Gemm::LayoutSFB; + using ScaleConfig = typename Gemm::ScaleConfig; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + + StrideA a_stride; + StrideB b_stride; + StrideC c_stride; + a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + + LayoutSFA layout_SFA = + ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); + LayoutSFB layout_SFB = + 
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + auto mainloop_args = [&](){ + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + }(); + auto prob_shape = cute::make_shape(m, n, k, 1); + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template +void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + // TODO: better heuristics + cutlass_gemm_caller_blockwise, + Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueScheduleAuto, + cutlass::gemm::collective::KernelScheduleAuto>>( + out, a, b, a_scales, b_scales); +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp index e049a5f2d2..9ceb3a3ece 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -47,4 +47,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); + +void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu index 0c47ab8299..dc87c5c35c 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu @@ -1,11 +1,9 @@ -#include +#include "c3x/scaled_mm_helper.hpp" #include "c3x/scaled_mm_kernels.hpp" -#include "cuda_utils.h" - /* This file defines quantized GEMM operations using the CUTLASS 3.x API, for - NVIDIA GPUs with sm120 (Blackwell Geforce). + NVIDIA GPUs with sm120 (Blackwell). 
*/ #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120 @@ -15,20 +13,10 @@ void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - - int M = a.size(0), N = b.size(1), K = a.size(1); - TORCH_CHECK( - (a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && - (b_scales.numel() == 1 || b_scales.numel() == b.size(1)), - "Currently, block scaled fp8 gemm is not implemented for Blackwell"); - - // Standard per-tensor/per-token/per-channel scaling - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn, - "Currently, only fp8 gemm is implemented for Blackwell"); - vllm::cutlass_scaled_mm_sm120_fp8(c, a, b, a_scales, b_scales, bias); + dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, + vllm::cutlass_scaled_mm_sm120_fp8, + nullptr, // int8 not supported on SM120 + vllm::cutlass_scaled_mm_blockwise_sm120_fp8); } #endif From 17eaaef59504aa6786cbf89a8d5012d7b64839de Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 8 Aug 2025 10:20:21 +0800 Subject: [PATCH 079/932] [Bugfix] Fix RuntimeError: Index put requires the source and destination dtypes match (#22065) Signed-off-by: chaunceyjiang --- .../test_completion_with_image_embeds.py | 103 ++++++++++++++++++ vllm/model_executor/models/utils.py | 5 +- 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 tests/v1/entrypoints/openai/test_completion_with_image_embeds.py diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py new file mode 100644 index 0000000000..be98be8d14 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import io +import json + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +import torch +from transformers import AutoConfig + +from tests.conftest import ImageTestAssets +from tests.utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "llava-hf/llava-1.5-7b-hf" +CONFIG = AutoConfig.from_pretrained(MODEL_NAME) +MAXIMUM_IMAGES = 2 + + +@pytest.fixture(scope="module") +def default_image_embeds_server_args() -> list[str]: + return [ + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "4", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"image": MAXIMUM_IMAGES}), + ] + + +@pytest.fixture(scope="module") +def server_with_image_embeds(default_image_embeds_server_args): + with RemoteOpenAIServer(MODEL_NAME, + default_image_embeds_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client_with_image_embeds(server_with_image_embeds): + async with server_with_image_embeds.get_async_client() as async_client: + yield async_client + + +def encode_image_embedding_to_base64(image_embedding) -> str: + """ + Encode image embedding to base64 string + """ + buffer = io.BytesIO() + torch.save(image_embedding, buffer) + buffer.seek(0) + binary_data = buffer.read() + base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') + return base64_image_embedding + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32]) +async def test_completions_with_image_embeds( + client_with_image_embeds: openai.AsyncOpenAI, + model_name: str, + image_assets: ImageTestAssets, + dtype: torch.dtype, +): + # Test case: Single image embeds input + image_embeds = image_assets[0].image_embeds.to(dtype=dtype) + base64_image_embedding = encode_image_embedding_to_base64(image_embeds) + chat_completion = await client_with_image_embeds.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": + "text", + "text": + "Describe these images separately. For each image," + "reply with a short sentence (no more than 10 words).", + }, + { + "type": "image_embeds", + "image_embeds": base64_image_embedding, + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index fecd14dde4..c69df6e616 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -401,7 +401,7 @@ def merge_multimodal_embeddings_from_map( """ flattened_embeddings = _flatten_embeddings(multimodal_embeddings) inputs_embeds[placeholder_map.dest] = flattened_embeddings[ - placeholder_map.src] + placeholder_map.src].to(dtype=inputs_embeds.dtype) return inputs_embeds @@ -421,7 +421,8 @@ def _merge_multimodal_embeddings( flattened = _flatten_embeddings(multimodal_embeddings) try: # This is equivalent to: inputs_embeds[is_multimodal] = flattened. 
- inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened) + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + flattened.to(dtype=inputs_embeds.dtype)) except RuntimeError as e: num_expected_tokens = is_multimodal.sum().item() assert isinstance(num_expected_tokens, int) From c152e2a8a0f49edfc06d760f04ff617310384757 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Fri, 8 Aug 2025 10:37:23 +0800 Subject: [PATCH 080/932] not tie_word_embeddings for glm-4.5 and glm-4.5v (#22460) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index bd3e27662e..0053e4e6ff 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -601,8 +601,6 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): quant_config=quant_config) else: self.lm_head = PPMissingLayer() - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) From 6f287915d8e4c2c09e7db2eb5cb670036d33f478 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Fri, 8 Aug 2025 11:18:50 +0800 Subject: [PATCH 081/932] Optimize MiniCPMO mask creation with vectorized implementation (#22464) Signed-off-by: zitian.zhao Signed-off-by: zitian zhao --- vllm/model_executor/models/minicpmo.py | 32 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 4e4fc3d5c7..fd91c7fcc1 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -587,15 +587,29 @@ class MiniCPMO(MiniCPMV2_6): num_lookhead: int = 0, ) -> torch.Tensor: ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, - 0) - ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, - size) - ret[i, start:ending] = True + # Vectorized computation of row indices and chunk boundaries + row_indices = torch.arange(size, device=device) + chunk_indices = row_indices // chunk_size + if num_left_chunks < 0: + # If num_left_chunks < 0, start is always 0 for all rows + start_indices = torch.zeros_like(row_indices) + else: + # Compute start indices vectorially + start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, + min=0) + start_indices = start_chunk_indices * chunk_size + # Compute ending indices vectorially + end_chunk_indices = chunk_indices + 1 + end_indices = torch.clamp(end_chunk_indices * chunk_size + + num_lookhead, + max=size) + # Create column indices for broadcasting + col_indices = torch.arange(size, device=device).unsqueeze(0) + row_indices = row_indices.unsqueeze(1) + start_indices = start_indices.unsqueeze(1) + end_indices = end_indices.unsqueeze(1) + # Vectorized mask creation + ret = (col_indices >= start_indices) & (col_indices < end_indices) return ret def _get_feat_extract_output_lengths(self, From 157f9c13687e38b89fdeb20ecdbb75baf8153e0f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 11:21:54 +0800 Subject: [PATCH 082/932] Fix pre-commit (#22487) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/minicpmo.py | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index fd91c7fcc1..1ee0a94c37 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -589,7 +589,7 @@ class MiniCPMO(MiniCPMV2_6): ret = torch.zeros(size, size, device=device, dtype=torch.bool) # Vectorized computation of row indices and chunk boundaries row_indices = torch.arange(size, device=device) - chunk_indices = row_indices // chunk_size + chunk_indices = row_indices // chunk_size if num_left_chunks < 0: # If num_left_chunks < 0, start is always 0 for all rows start_indices = torch.zeros_like(row_indices) @@ -597,7 +597,7 @@ class MiniCPMO(MiniCPMV2_6): # Compute start indices vectorially start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, min=0) - start_indices = start_chunk_indices * chunk_size + start_indices = start_chunk_indices * chunk_size # Compute ending indices vectorially end_chunk_indices = chunk_indices + 1 end_indices = torch.clamp(end_chunk_indices * chunk_size + From af473f0a85731c17d9cf708deec3e864e674feb0 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:25:01 +0800 Subject: [PATCH 083/932] [bugfix] Fix Llama3/4 issues caused by FlashInfer 0.2.10 (#22426) Signed-off-by: Po-Han Huang --- .../quantization/utils/flashinfer_utils.py | 22 +++++++++++++------ vllm/v1/attention/backends/flashinfer.py | 3 ++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index c6f914febc..9fb194767e 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -6,14 +6,22 @@ import torch def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): - from flashinfer import next_positive_power_of_2 - # Guess tokens per expert assuming perfect expert distribution first. - num_tokens_per_expert = (num_tokens * top_k) // num_experts - # And pad the number to the next power of 2. - tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) - # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. - tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now. + # TODO: Revert this to dynamic calculation once a new version of FlashInfer + # with the necessary kernels is released. + tile_tokens_dim = 8 + + # from flashinfer import next_positive_power_of_2 + + # # Guess tokens per expert assuming perfect expert distribution first. + # num_tokens_per_expert = (num_tokens * top_k) // num_experts + # # And pad the number to the next power of 2. + # tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) + # # Cap to 8-64 tokens per CTA tile as it's the range supported by the + # # kernel. 
+ # tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1fcb190286..c85d8bce31 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -524,7 +524,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): head_dim = self.kv_cache_spec.head_size # currently prefill trtllm attention does not support fp8 kv cache - prefill_use_trtllm = use_trtllm_attention( + prefill_use_trtllm = not cache_dtype.startswith("fp8") \ + and use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, num_qo_heads, num_kv_heads, head_dim) decode_use_trtllm = use_trtllm_attention( From 099c0464637f330f8ea38b07fe0694717c16d815 Mon Sep 17 00:00:00 2001 From: iAmir97 <71513472+iAmir97@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:25:18 +0700 Subject: [PATCH 084/932] [Doc] Sleep mode documentation (#22310) Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Hong Hanh Co-authored-by: youkaichao --- docs/features/sleep_mode.md | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docs/features/sleep_mode.md diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md new file mode 100644 index 0000000000..5749b02d26 --- /dev/null +++ b/docs/features/sleep_mode.md @@ -0,0 +1,80 @@ +# Sleep Mode + +vLLM's Sleep Mode allows you to temporarily release most GPU memory used by a model, including model weights and KV cache, without stopping the server or unloading the Docker container. This is especially useful for RLHF, training, or cost-saving scenarios where GPU resources need to be freed between inference workloads. + +Key benefits: + +- **Frees GPU memory**: Offloads model weights to CPU RAM and discards KV cache, releasing up to 90%+ of GPU memory for other tasks. +- **Fast resume**: Quickly wake up the engine and resume inference without full model reload. +- **API endpoints**: Control sleep/wake_up state via HTTP endpoints or Python API. +- **Supports distributed workloads**: Works with tensor parallelism, pipeline parallelism, etc. +- **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates. + +!!! note + This feature is only supported on CUDA platform. + +## Sleep levels + +Level 1 sleep will offload the model weights and discard the KV cache. The content of KV cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the KV cache (while the model's buffers are kept in CPU, like rope scaling tensors). The content of both the model weights and KV cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed, e.g. RLHF weight update. + +## Usage + +### Offline inference + +Enable sleep mode by passing `enable_sleep_mode=True` to the `LLM` class. 
+ +```python +from vllm import LLM +llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True) +``` + +#### Python API + +```python +# Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache) +llm.sleep(level=1) + +# Wake up the engine (restore weights) +llm.wake_up() +``` + +#### RLHF weight updates + +During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations. + +Use `tags=["weights"]` or `tags=["kv_cache"]` to control which resources are restored, useful for RLHF and weight updates. **Note** that `is_sleeping` will report `true` until all components are awake. + +```python +# Put engine to deep sleep (level=2) +llm.sleep(level=2) +# ... Get the new weights +# Wake up only weights to avoid OOM +llm.wake_up(tags=["weights"]) +# ... Update the weights +# wake up KV cache after weights are updated +llm.wake_up(tags=["kv_cache"]) +``` + +### Online Serving + +To enable sleep mode in a vLLM server you need to initialize it with the flag `VLLM_SERVER_DEV_MODE=1` and pass `--enable-sleep-mode` to the vLLM server. + +#### Server in development mode + +When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users. + +```bash +VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-0.6B \ + --enable-sleep-mode \ + --port 8000 +``` + +#### HTTP endpoints + +- `POST /sleep?level=1` — Put the model to sleep (`level=1`). +- `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`). +- `GET /is_sleeping` — Check if the model is sleeping. + +!!! note + These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`. 
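To make the HTTP endpoints documented above concrete, here is a minimal client-side sketch (illustrative only, not taken from the patch). It assumes a server launched as in the serving example (`VLLM_SERVER_DEV_MODE=1` with `--enable-sleep-mode`) listening on `localhost:8000`, and uses the third-party `requests` package purely for convenience:

```python
# Illustrative client for the sleep-mode endpoints described above.
# Assumed: `pip install requests`, and a vLLM OpenAI-compatible server started
# with VLLM_SERVER_DEV_MODE=1 and --enable-sleep-mode on localhost:8000.
import requests

BASE = "http://localhost:8000"  # assumed server address

# Put the model to sleep (level 1: weights offloaded to CPU, KV cache discarded).
requests.post(f"{BASE}/sleep", params={"level": 1}).raise_for_status()
print(requests.get(f"{BASE}/is_sleeping").text)

# Wake up the weights first (e.g. before a weight update), then the KV cache.
requests.post(f"{BASE}/wake_up", params={"tags": "weights"}).raise_for_status()
requests.post(f"{BASE}/wake_up", params={"tags": "kv_cache"}).raise_for_status()
print(requests.get(f"{BASE}/is_sleeping").text)
```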
From 808a7b69df479b6b3a16181711cac7ca28a9b941 Mon Sep 17 00:00:00 2001 From: lkchen Date: Thu, 7 Aug 2025 23:15:50 -0700 Subject: [PATCH 085/932] [bench] Fix benchmark/serve.py to ignore unavailable results (#22382) Signed-off-by: Linkun --- vllm/benchmarks/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 6d52b51a9f..7cdf87cb4c 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -665,7 +665,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: [results[k]] - for k in metrics}, + for k in metrics if k in results}, extra_info={ k: results[k] for k in results if k not in metrics and k not in ignored_metrics From 1712543df6d0ebdc2cc9649e246ae983c92dabd3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 15:31:19 +0800 Subject: [PATCH 086/932] [CI/Build] Fix multimodal tests (#22491) Signed-off-by: DarkLight1337 --- vllm/engine/llm_engine.py | 3 ++- vllm/multimodal/registry.py | 25 +++++++++++++------------ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 79255b031e..3fc4f6445d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -845,7 +845,8 @@ class LLMEngine: def reset_mm_cache(self) -> bool: """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache() + return self.input_preprocessor.mm_registry.reset_processor_cache( + self.model_config) def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: """Reset prefix cache for all devices.""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index dca04e9a1e..565d54e1a2 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass +from functools import lru_cache from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -86,6 +87,13 @@ class _ProcessorFactories(Generic[_I]): return self.processor(info, dummy_inputs_builder, cache=cache) +# Make sure a different cache is used for each model config +# NOTE: ModelConfig is not hashable so it cannot be passed directly +@lru_cache(maxsize=1) +def _get_processor_cache(model_id: str, capacity_gb: int): + return ProcessingCache(capacity_gb) if capacity_gb > 0 else None + + class MultiModalRegistry: """ A registry that dispatches data processing according to the model. 
@@ -95,22 +103,15 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - self._processor_cache: Optional[ProcessingCache] = None - def _get_processor_cache(self, model_config: "ModelConfig"): + model_id = model_config.model capacity_gb = model_config.mm_processor_cache_gb - if capacity_gb is None: - return None # Overrides `disable_cache` argument + return _get_processor_cache(model_id, capacity_gb) - if self._processor_cache is None: - self._processor_cache = ProcessingCache(capacity_gb) - - return self._processor_cache - - def reset_processor_cache(self) -> bool: + def reset_processor_cache(self, model_config: "ModelConfig") -> bool: """Reset the multi-modal processing cache.""" - if self._processor_cache: - self._processor_cache.reset() + if processor_cache := self._get_processor_cache(model_config): + processor_cache.reset() return True # Success diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 45f450291a..7b4ed90fd1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -566,7 +566,7 @@ class AsyncLLM(EngineClient): await self.engine_core.profile_async(False) async def reset_mm_cache(self) -> None: - self.processor.mm_registry.reset_processor_cache() + self.processor.mm_registry.reset_processor_cache(self.model_config) self.processor.mm_input_cache_client.reset() await self.engine_core.reset_mm_cache_async() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efbdffbc09..5a00a93095 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -271,7 +271,7 @@ class LLMEngine: self.engine_core.profile(False) def reset_mm_cache(self): - self.processor.mm_registry.reset_processor_cache() + self.processor.mm_registry.reset_processor_cache(self.model_config) self.processor.mm_input_cache_client.reset() self.engine_core.reset_mm_cache() From 43c4f3d77c3c03f67385201e1b1725a6ba6bcc7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 16:11:54 +0800 Subject: [PATCH 087/932] [Misc] Begin deprecation of `get_tensor_model_*_group` (#22494) Signed-off-by: DarkLight1337 --- tests/distributed/test_custom_all_reduce.py | 5 ++--- tests/distributed/test_quick_all_reduce.py | 5 ++--- vllm/distributed/parallel_state.py | 16 ++++++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index fae49c41d5..9212c04dee 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -10,8 +10,7 @@ import torch.distributed as dist from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) +from vllm.distributed.parallel_state import get_tp_group, graph_capture from ..utils import (ensure_model_parallel_initialized, init_test_distributed_environment, multi_process_parallel) @@ -37,7 +36,7 @@ def graph_allreduce( init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group + group = get_tp_group().device_group # A small all_reduce for warmup. 
# this is needed because device communicators might be created lazily diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index a4added291..6245ccbeca 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -10,8 +10,7 @@ import torch.distributed as dist from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) +from vllm.distributed.parallel_state import get_tp_group, graph_capture from vllm.platforms import current_platform from ..utils import (ensure_model_parallel_initialized, @@ -42,7 +41,7 @@ def graph_quickreduce( init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group + group = get_tp_group().device_group # A small all_reduce for warmup. # this is needed because device communicators might be created lazily diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6c25cdcfb7..0b3993ca02 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -36,6 +36,7 @@ from unittest.mock import patch import torch import torch.distributed from torch.distributed import Backend, ProcessGroup +from typing_extensions import deprecated import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( @@ -894,8 +895,12 @@ def get_tp_group() -> GroupCoordinator: return _TP -# kept for backward compatibility -get_tensor_model_parallel_group = get_tp_group +@deprecated("`get_tensor_model_parallel_group` has been replaced with " + "`get_tp_group` and may be removed after v0.12. Please use " + "`get_tp_group` instead.") +def get_tensor_model_parallel_group(): + return get_tp_group() + _PP: Optional[GroupCoordinator] = None @@ -921,8 +926,11 @@ def get_pp_group() -> GroupCoordinator: return _PP -# kept for backward compatibility -get_pipeline_model_parallel_group = get_pp_group +@deprecated("`get_pipeline_model_parallel_group` has been replaced with " + "`get_pp_group` and may be removed in v0.12. Please use " + "`get_pp_group` instead.") +def get_pipeline_model_parallel_group(): + return get_pp_group() @contextmanager From 904063907c141fe59c2302afe5bc94cbb53c0de6 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Fri, 8 Aug 2025 16:12:54 +0800 Subject: [PATCH 088/932] [Misc] fix openai version (#22485) Signed-off-by: rongfu.leng --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 5c422500e1..1a8fea0dd7 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
aiohttp -openai >= 1.98.0 # For Responses API with reasoning content +openai >= 1.99.1 # For Responses API with reasoning content pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From ccdae737a0c947467488c05f61537e5658fe5064 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 8 Aug 2025 01:13:18 -0700 Subject: [PATCH 089/932] [BugFix] Don't cancel asyncio tasks directly from destructors (#22476) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 23 +++++++++++++++++------ vllm/v1/engine/async_llm.py | 5 ++--- vllm/v1/engine/core_client.py | 9 ++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index ce62282c21..6d82714f3c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -687,19 +687,30 @@ class AsyncMicrobatchTokenizer: max_length = kwargs.get("max_length") if not truncation: - return ("encode", add_special_tokens, False, None) + return "encode", add_special_tokens, False, None model_max = getattr(self.tokenizer, "model_max_length", None) if max_length is None or (model_max is not None and max_length == model_max): - return ("encode", add_special_tokens, True, "model_max") + return "encode", add_special_tokens, True, "model_max" - return ("encode", "other") + return "encode", "other" def __del__(self): - for task in self._batcher_tasks: - if not task.done(): - task.cancel() + if ((tasks := getattr(self, "_batcher_tasks", None)) + and (loop := getattr(self, "_loop", None)) + and not loop.is_closed()): + + def cancel_tasks(): + for task in tasks: + task.cancel() + + loop.call_soon_threadsafe(cancel_tasks) + + +def cancel_task_threadsafe(task: Task): + if task and not task.done() and not (loop := task.get_loop()).is_closed(): + loop.call_soon_threadsafe(task.cancel) def make_async( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7b4ed90fd1..a270632791 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv, deprecate_kwargs +from vllm.utils import Device, cancel_task_threadsafe, cdiv, deprecate_kwargs from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -219,8 +219,7 @@ class AsyncLLM(EngineClient): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() + cancel_task_threadsafe(getattr(self, "output_handler", None)) async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return await self.engine_core.get_supported_tasks_async() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d30bb6b74..05b4d72608 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -23,7 +23,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask -from vllm.utils import get_open_port, get_open_zmq_inproc_path, make_zmq_socket +from vllm.utils import (cancel_task_threadsafe, get_open_port, + get_open_zmq_inproc_path, make_zmq_socket) from vllm.v1.engine import (EngineCoreOutputs, 
EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, @@ -342,10 +343,8 @@ class BackgroundResources: if self.coordinator is not None: self.coordinator.close() - if self.output_queue_task is not None: - self.output_queue_task.cancel() - if self.stats_update_task is not None: - self.stats_update_task.cancel() + cancel_task_threadsafe(self.output_queue_task) + cancel_task_threadsafe(self.stats_update_task) # ZMQ context termination can hang if the sockets # aren't explicitly closed first. From 7be7f3824a2d610299991ceefb1b034b3a923b0f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:02:51 +0100 Subject: [PATCH 090/932] [Docs] Improve API docs (+small tweaks) (#22459) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 5 ++--- docs/api/{README.md => summary.md} | 0 docs/features/quantization/inc.md | 5 +---- docs/mkdocs/hooks/generate_examples.py | 2 +- mkdocs.yaml | 6 +----- 5 files changed, 5 insertions(+), 13 deletions(-) rename docs/api/{README.md => summary.md} (100%) diff --git a/docs/.nav.yml b/docs/.nav.yml index ad742be3d6..77342e2674 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -58,10 +58,9 @@ nav: - CI: contributing/ci - Design Documents: design - API Reference: - - Summary: api/README.md + - Summary: api/summary.md - Contents: - - glob: api/vllm/* - preserve_directory_names: true + - api/vllm/* - CLI Reference: - Summary: cli/README.md - Community: diff --git a/docs/api/README.md b/docs/api/summary.md similarity index 100% rename from docs/api/README.md rename to docs/api/summary.md diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md index d97a462f54..13b151bc7f 100644 --- a/docs/features/quantization/inc.md +++ b/docs/features/quantization/inc.md @@ -1,7 +1,4 @@ ---- -title: FP8 INC ---- -[](){ #inc } +# FP8 INC vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators. Currently, quantization is validated only in Llama models. 
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 0ee52bb346..6b4c5b3107 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -105,7 +105,7 @@ class Example: return fix_case(self.path.stem.replace("_", " ").title()) def generate(self) -> str: - content = f"---\ntitle: {self.title}\n---\n\n" + content = f"# {self.title}\n\n" content += f"Source .\n\n" # Use long code fence to avoid issues with diff --git a/mkdocs.yaml b/mkdocs.yaml index e5b7454003..3a64888fb4 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -40,6 +40,7 @@ theme: - navigation.sections - navigation.prune - navigation.top + - navigation.indexes - search.highlight - search.share - toc.follow @@ -51,11 +52,6 @@ hooks: - docs/mkdocs/hooks/generate_argparse.py - docs/mkdocs/hooks/url_schemes.py -# Required to stop api-autonav from raising an error -# https://github.com/tlambert03/mkdocs-api-autonav/issues/16 -nav: - - api - plugins: - meta - search From e5ebeeba531755a78f68413e88a23d061404f3e3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:06:46 +0100 Subject: [PATCH 091/932] Remove exception for Python 3.8 typing from linter (#22506) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- pyproject.toml | 2 -- vllm/utils/__init__.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfad5d2cdf..03a32ac0ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,8 +73,6 @@ line-length = 80 "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] -# Python 3.8 typing - skip utils for ROCm -"vllm/utils/__init__.py" = ["UP006", "UP035"] [tool.ruff.lint] select = [ diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 6d82714f3c..e39cdf76dc 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -47,7 +47,7 @@ from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, TextIO, Tuple, TypeVar, Union, cast, overload) + Optional, TextIO, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -861,7 +861,7 @@ def is_valid_ipv6_address(address: str) -> bool: return False -def split_host_port(host_port: str) -> Tuple[str, int]: +def split_host_port(host_port: str) -> tuple[str, int]: # ipv6 if host_port.startswith('['): host, port = host_port.rsplit(']', 1) From e789cad6b8b5d2a01aa6521b9208bb8d6501ee5b Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Fri, 8 Aug 2025 08:24:07 -0700 Subject: [PATCH 092/932] [gpt-oss] triton kernel mxfp4 (#22421) Signed-off-by: Signed-off-by: Yongye Zhu --- .gitignore | 3 + .../moe/test_gpt_oss_triton_kernels.py | 375 ++++++++++++++++++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 230 +++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 17 +- .../layers/quantization/mxfp4.py | 66 ++- .../layers/quantization/utils/mxfp4_utils.py | 46 ++- vllm/model_executor/layers/utils.py | 21 + vllm/utils/__init__.py | 6 + 8 files changed, 755 insertions(+), 9 deletions(-) create mode 100644 tests/kernels/moe/test_gpt_oss_triton_kernels.py create mode 100644 vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py diff --git a/.gitignore b/.gitignore index 96b97a552c..5dc0f04b6f 
100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# triton jit +.triton + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py new file mode 100644 index 0000000000..3f9b32ce5a --- /dev/null +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass, fields + +import pytest +import torch +import torch.nn.functional as F +import triton_kernels.swiglu +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import (downcast_to_mxfp, + upcast_from_mxfp) +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.testing import assert_close + +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedPrepareAndFinalize) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + BatchedOAITritonExperts, triton_kernel_moe_forward) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.utils import shuffle_weight +from vllm.utils import round_up + + +def deshuffle(w: torch.Tensor): + first = w[..., ::2] + second = w[..., 1::2] + + deshuffled = torch.concat((first, second), dim=-1) + return deshuffled + + +def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): + randbits = [torch.randperm(E) for _ in range(M)] + x_list = [ + (-1)**i * + ((16384 + + ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16)) + for i, bits in enumerate(randbits) + ] + exp_data = torch.stack(x_list).to( + device="cuda") # simulating gate_output (M, E) + + # create input tensor + x = torch.randn((M, K), dtype=torch.bfloat16, device="cuda") + w1 = torch.randn((E, 2 * N, K), dtype=torch.bfloat16, device="cuda") + w1_bias = torch.randn((E, 2 * N), dtype=torch.bfloat16, device="cuda") + + w2 = torch.randn((E, K, N), dtype=torch.bfloat16, device="cuda") + w2_bias = torch.randn((E, K), dtype=torch.bfloat16, device="cuda") + + exp_data_tri = exp_data.clone() + x_tri = x.clone() + w1_tri = w1.clone() + w2_tri = w2.clone() + + w1_bias_tri = w1_bias.clone() + w2_bias_tri = w2_bias.clone() + w1_bias_tri = w1_bias_tri.to(torch.float32) + w2_bias_tri = w2_bias_tri.to(torch.float32) + + dtype_dict = { + "bf16": torch.bfloat16, + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2 + } + + x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16) + if w_dtype != "mx4": + # simulate quantization support on reference impl + w1 = w1.to(dtype_dict[w_dtype]).to(torch.bfloat16) + w2 = w2.to(dtype_dict[w_dtype]).to(torch.bfloat16) + + # triton moe kernel use transposed shape for matmul + w1_tri = w1_tri.transpose(-2, -1) + w2_tri = w2_tri.transpose(-2, -1) + + # shuffle weights + w1_tri = shuffle_weight(w1_tri) + w1_bias_tri = shuffle_weight(w1_bias_tri) + + # quant triton_weights + x_tri = x.to(dtype_dict[a_dtype]) + if w_dtype != "mx4": + pytest.skip("NYI") + else: # quantize to mx4 + # careful on the padding here, the activation padding need to be + # multiple of 64, the actual engine is not implemented + 
w1_bottom_pad = round_up(w1_tri.shape[1], 64) - w1_tri.shape[1] + w1_right_pad = round_up(w1_tri.shape[2], 128) - w1_tri.shape[2] + + w2_bottom_pad = w1_right_pad // 2 + w2_right_pad = w1_bottom_pad + + x_pad = w1_bottom_pad + + w1_tri = F.pad(w1_tri, (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), + mode="constant", + value=0) + w2_tri = F.pad(w2_tri, (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), + mode="constant", + value=0) + + w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0), + mode="constant", + value=0) + w2_bias_tri = F.pad(w2_bias_tri, (0, w2_right_pad, 0, 0), + mode="constant", + value=0) + + x_tri = F.pad(x_tri, (0, x_pad, 0, 0), mode="constant", value=0) + + w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout( + mx_axis=1) + w_scale_layout, w_scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout( + mx_axis=1, num_warps=num_warps)) + + w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1) + w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, torch.bfloat16, axis=1) + + w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1) + w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, torch.bfloat16, axis=1) + + w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, + **w_layout_opts) + w1_scale_tri = convert_layout(wrap_torch_tensor(w1_scale_tri), + w_scale_layout, **w_scale_layout_opts) + + w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, + **w_layout_opts) + w2_scale_tri = convert_layout(wrap_torch_tensor(w2_scale_tri), + w_scale_layout, **w_scale_layout_opts) + + pc1 = PrecisionConfig(weight_scale=w1_scale_tri, + flex_ctx=FlexCtx(rhs_data=InFlexData())) + pc2 = PrecisionConfig(weight_scale=w2_scale_tri, + flex_ctx=FlexCtx(rhs_data=InFlexData())) + + # tucuate so the rest can run properly + w1 = w1[..., :K, :2 * N] + w2 = w2[..., :N, :K] + + w1 = deshuffle(w1) + + w1 = w1.transpose(-1, -2).contiguous() + w2 = w2.transpose(-1, -2).contiguous() + + return (x, w1, w1_bias, w2, w2_bias, exp_data, x_tri, w1_tri, w2_tri, + exp_data_tri, w1_bias_tri, w2_bias_tri, pc1, pc2) + + +@dataclass +class ModelConfig: + num_hidden_layers: int = 36 + num_experts: int = 128 + experts_per_token: int = 4 + vocab_size: int = 201088 + hidden_size: int = 2880 + intermediate_size: int = 2880 + head_dim: int = 64 + num_attention_heads: int = 64 + num_key_value_heads: int = 8 + sliding_window: int = 128 + initial_context_length: int = 4096 + rope_theta: float = 150000.0 + rope_scaling_factor: float = 32.0 + rope_ntk_alpha: float = 1.0 + rope_ntk_beta: float = 32.0 + + +def swiglu(x, alpha: float = 1.702, limit: float = 1.0): + # Note we add an extra bias of 1 to the linear layer + x_glu, x_linear = torch.chunk(x, 2, dim=-1) + if limit is not None: + x_glu = x_glu.clamp(max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + if limit is not None: + x_linear = x_linear.clamp(min=-limit, max=limit) + return out_glu * (x_linear + 1) + + +def oai_moe_forward( + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, 2N) + w1_bias: torch.Tensor, # (E, 2N, K) + w2: torch.Tensor, # (E, K, N) + w2_bias: torch.Tensor, # (E, N) + gating_output: torch.Tensor, # (M, E) + topk: int): + # model.py 309:330, assuming gating and norm + t = hidden_states + experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True) + expert_weights = torch.nn.functional.softmax(experts.values, dim=1) + expert_indices = experts.indices + + # MLP #1 + mlp1_weight = w1[expert_indices, ...] + mlp1_bias = w1_bias[expert_indices, ...] 
+ t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias + t = swiglu(t, limit=7) + + # MLP #2 + mlp2_weight = w2[expert_indices, ...] + mlp2_bias = w2_bias[expert_indices, ...] + t = torch.einsum("beck,bek->bec", mlp2_weight, t) + t += mlp2_bias + + # Weighted sum of experts + t = torch.einsum("bec,be->bc", t, expert_weights) + + return t + + +@dataclass +class Case: + a_dtype: str + w_dtype: str + + +@pytest.mark.parametrize( + ", ".join(f.name for f in fields(Case)), + [ + tuple(getattr(case, f.name) for f in fields(Case)) for case in [ + # Case(a_dtype="bf16", w_dtype="bf16"), + # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"), + Case(a_dtype="bf16", w_dtype="mx4") + ] + ], +) +@pytest.mark.parametrize("num_token", [2]) +@pytest.mark.parametrize("tp", [1, 2, 4, 8]) +def test_equiv(num_token, a_dtype, w_dtype, tp): + M = num_token + E = ModelConfig.num_experts + K = ModelConfig.hidden_size + N = ModelConfig.intermediate_size // tp + topk = ModelConfig.experts_per_token + + x, w1, w1_bias, w2, w2_bias, exp_data, \ + x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri,\ + w2_bias_tri, pc1, pc2 = init_compute_data( + M, K, N, E, a_dtype, w_dtype, num_warps=8) + + out_triton_monolithic = triton_kernel_moe_forward( + hidden_states=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + w1_precision=pc1, + w2_precision=pc2) + out_triton_monolithic = out_triton_monolithic[..., :K] + + out_ref = oai_moe_forward(hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk) + assert_close(ref=out_ref, + tri=out_triton_monolithic, + maxtol=0.025, + rmstol=0.005) + + +def batched_moe(a: torch.Tensor, w1, w2, gating_output: torch.Tensor, + topk: int, renormalize: bool, w1_bias: torch.Tensor, + w2_bias: torch.Tensor, w1_precision: PrecisionConfig, + w2_precision: PrecisionConfig) -> torch.Tensor: + max_num_tokens = round_up(a.shape[0], 64) + + fused_experts = FusedMoEModularKernel( + BatchedPrepareAndFinalize(max_num_tokens, + num_dispatchers=1, + num_local_experts=w1.shape[0], + rank=0), + BatchedOAITritonExperts( + None, + max_num_tokens=max_num_tokens, + num_dispatchers=1, + w1_precision=w1_precision, + w2_precision=w2_precision, + ), + ) + + extra_expert_args = { + "w1_bias": w1_bias, + "w2_bias": w2_bias, + } + + topk_weight, topk_ids, _ = fused_topk(a, gating_output, topk, renormalize) + + return fused_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + extra_expert_args=extra_expert_args, + ) + + +@pytest.mark.parametrize( + ", ".join(f.name for f in fields(Case)), + [ + tuple(getattr(case, f.name) for f in fields(Case)) for case in [ + # Case(a_dtype="bf16", w_dtype="bf16"), + # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"), + Case(a_dtype="bf16", w_dtype="mx4") + ] + ], +) +@pytest.mark.parametrize("num_token", [64]) +@pytest.mark.parametrize("ep", [1, 2, 4, 8]) +def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep): + M = num_token + E = ModelConfig.num_experts // ep + K = ModelConfig.hidden_size + N = ModelConfig.intermediate_size + topk = ModelConfig.experts_per_token + + x, w1, w1_bias, w2, w2_bias, exp_data, \ + x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri, \ + w2_bias_tri, pc1, pc2 = init_compute_data( + M, K, N, E, a_dtype, w_dtype, num_warps=4) + + out_tri = batched_moe(a=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + 
w1_precision=pc1, + w2_precision=pc2) + out_tri = out_tri[..., :K] + + out_ref = oai_moe_forward(hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk) + assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005) + + +def test_unit_shuffle(): + N = ModelConfig.intermediate_size + K = ModelConfig.hidden_size + m = torch.randn((K, 2 * N), dtype=torch.bfloat16, device="cuda") + + x = torch.randn(K, dtype=torch.bfloat16, device="cuda") + + m_shuffled = shuffle_weight(m) + + out_ref = x @ m + out_ref = swiglu(out_ref, limit=1.0) + + out = x @ m_shuffled + out = triton_kernels.swiglu.swiglu_torch( + out, + alpha=1.702, + precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0)) + + assert_close(ref=out_ref, tri=out) \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py new file mode 100644 index 0000000000..4482029c16 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate) +from vllm.model_executor.layers.fused_moe.utils import extract_required_args + +if True: + import triton_kernels.swiglu + from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, + PrecisionConfig, matmul_ogs) + from triton_kernels.routing import routing + + +def triton_kernel_moe_forward( + hidden_states: torch.Tensor, + w1, # Tensor or triton_kernels.Tensor + w2, # Tensor or triton_kernels.Tensor + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + w1_precision=None, # PrecisionConfig or None + w2_precision=None, # PrecisionConfig or None + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + + routing_data, gather_idx, scatter_idx = routing(gating_output, + topk, + sm_first=not renormalize) + + return triton_kernel_fused_experts( + None, + hidden_states, + w1, + w2, + routing_data, + gather_idx, + scatter_idx, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=use_fp8_w8a8, + per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_precision=w1_precision, + w2_precision=w2_precision, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) + + +# This is a triton implementation of the fused_experts function +def triton_kernel_fused_experts( + output_tensor: torch.Tensor, + hidden_states: torch.Tensor, + w1, # Tensor or triton_kernels.Tensor + w2, # Tensor or triton_kernels.Tensor + routing_data, # RoutingData + gather_indx, # GatherIndx + scatter_indx, # ScatterIndx + 
activation: str = "silu", + swiglu_alpha: float = 1.702, + swiglu_limit: float = 7.0, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + w1_precision=None, # PrecisionConfig or None + w2_precision=None, # PrecisionConfig or None + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + + # type check, uint8 means mxfp4 + assert hidden_states.dtype == torch.bfloat16 + assert w1_bias is None or w1_bias.dtype == torch.float32 + assert w2_bias is None or w2_bias.dtype == torch.float32 + + # Shape check, only check non-mxfp4 + assert hidden_states.shape[-1] == w1.shape[-2] + assert w2.shape[-1] == w1.shape[1] + + E, _, N = w1.shape + + if global_num_experts == -1: + global_num_experts = E + + act = FusedActivation( + FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), + (swiglu_alpha, swiglu_limit), 2) + gammas = routing_data.gate_scal if routing_data else None + + intermediate_cache1 = matmul_ogs( + hidden_states, + w1, + w1_bias, + routing_data, + gather_indx=gather_indx, + precision_config=w1_precision, + gammas=gammas if apply_router_weight_on_input else None, + fused_activation=act) + + intermediate_cache3 = matmul_ogs( + intermediate_cache1, + w2, + w2_bias, + routing_data, + scatter_indx=scatter_indx, + precision_config=w2_precision, + gammas=None if apply_router_weight_on_input else gammas, + y=output_tensor, + ) + return intermediate_cache3 + + +class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, quant_config, max_num_tokens: int, num_dispatchers: int, + w1_precision: PrecisionConfig, w2_precision: PrecisionConfig): + super().__init__(quant_config) + self.max_num_tokens = max_num_tokens + self.num_dispatchers = num_dispatchers + self.w1_precision = w1_precision + self.w2_precision = w2_precision + + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int, + topk: int, global_num_experts: int, local_num_experts: int, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata] + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + # workspace are allocated inside the kernel + assert a.dim() == 2 + num_dp = self.num_dispatchers + num_experts = local_num_experts + max_num_tokens = self.max_num_tokens + workspace2 = (0, 0, 0) + output = (num_experts, max_num_tokens * num_dp, N) + return (output, workspace2, output, a.dtype) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + extra_expert_args: Optional[dict[str, Any]], + ): + w1_bias, w2_bias = (extract_required_args(extra_expert_args, + ["w1_bias", "w2_bias"])) + + return triton_kernel_fused_experts( + output, + hidden_states, + w1, + w2, + None, + None, + None, + activation=activation, + apply_router_weight_on_input=False, + use_fp8_w8a8=False, + per_channel_quant=False, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_precision=self.w1_precision, + w2_precision=self.w2_precision, + a1_scale=a1q_scale, + a2_scale=a2_scale) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 272b6ce672..d664a92841 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,7 +36,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, - round_up) + has_triton_kernels, is_torch_equal_or_newer, round_up) from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): @@ -723,10 +723,17 @@ class FusedMoE(torch.nn.Module): self.global_num_experts = num_experts + num_redundant_experts # we padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4" and ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): - hidden_size = round_up(hidden_size, 256) + if quant_config and quant_config.get_name() == "mxfp4": + if not is_torch_equal_or_newer("2.8.0"): + raise RuntimeError("Mxfp4 on hopper requires torch >= 2.8.0") + if current_platform.is_device_capability( + 90) and not has_triton_kernels(): + raise NotImplementedError( + "Triton kernels must be installed for mxfp4 on hopper") + if (current_platform.is_rocm() + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config diff --git a/vllm/model_executor/layers/quantization/mxfp4.py 
b/vllm/model_executor/layers/quantization/mxfp4.py index 068af02739..4e59aef480 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -8,16 +8,19 @@ from torch.nn.parameter import Parameter from vllm import envs from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + triton_kernel_moe_forward) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( - _can_support_mxfp4) + _can_support_mxfp4, _swizzle_mxfp4) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.utils import next_power_of_2, round_up if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 @@ -39,7 +42,7 @@ class Mxfp4Config(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 100 + return 90 @classmethod def get_name(cls) -> QuantizationMethods: @@ -100,11 +103,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): intermediate_size_per_partition # pad the intermediate size to be a multiple of 2 * mxfp4_block # for to hold non-uniform sharded tensor as well as swizzling + # other padding to increase performance if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 256) hidden_size = round_up(hidden_size, 256) + elif current_platform.is_rocm(): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128) + else: + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 64) self.intermediate_size = intermediate_size_per_partition_after_pad self.hidden_size = hidden_size @@ -303,7 +313,41 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( self.num_experts, -1), requires_grad=False) - return + else: + from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig + + w13_bias = layer.w13_bias.to(torch.float32) + w2_bias = layer.w2_bias.to(torch.float32) + + layer.w13_bias = Parameter(w13_bias, requires_grad=False) + layer.w2_bias = Parameter(w2_bias, requires_grad=False) + + # FIXME warp need to be adjusted based on batch size + # only apply to batched mode + if self.moe.use_ep: + num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 + else: + num_warps = 8 + + w13_weight, w13_flex, w13_scale = _swizzle_mxfp4( + layer.w13_weight, layer.w13_weight_scale, num_warps) + w2_weight, w2_flex, w2_scale = _swizzle_mxfp4( + layer.w2_weight, layer.w2_weight_scale, num_warps) + + self.w13_precision_config = PrecisionConfig( + weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)) + self.w2_precision_config = PrecisionConfig( + weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)) + + self.w13_weight_triton_tensor = w13_weight + self.w2_weight_triton_tensor = w2_weight + + # need to delete the original weights to save memory on single GPU + del layer.w13_weight + del layer.w2_weight + layer.w13_weight = None + layer.w2_weight = None + torch.cuda.empty_cache() def 
_get_tile_tokens_dim(self, x: torch.Tensor, top_k: int): # Number of tokens in the input tensor. @@ -404,3 +448,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): True, # do finalize )[0] return trtllm_gen_output + else: + return triton_kernel_moe_forward( + hidden_states=x, + w1=self.w13_weight_triton_tensor, + w2=self.w2_weight_triton_tensor, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_precision=self.w13_precision_config, + w2_precision=self.w2_precision_config, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 4a4e199e13..4084dd837c 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -4,11 +4,55 @@ from typing import Callable, Optional import torch -from vllm.utils import direct_register_custom_op +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer + +logger = init_logger(__name__) OCP_MX_BLOCK_SIZE = 32 +def _swizzle_mxfp4(quant_tensor, scale, num_warps): + """ weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel + """ + import triton_kernels.matmul_ogs_details.opt_flags as opt_flags + from triton_kernels.numerics import InFlexData + from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor + from triton_kernels.tensor_details import layout + from triton_kernels.tensor_details.layout import StridedLayout + if (current_platform.is_cuda() + and current_platform.is_device_capability(90) + and not is_torch_equal_or_newer("2.8.1")): + logger.warning_once( + "Mxfp4 on hopper is running on torch < 2.8.1, " + "this cause swizling to be disabled, which may " + "cause performance degradation. 
Please upgrade to torch nightly") + value_layout, value_layout_opts = StridedLayout, dict() + scale_layout, scale_layout_opts = StridedLayout, dict() + else: + value_layout, value_layout_opts = \ + layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + scale_layout, scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout( + mx_axis=1, num_warps=num_warps)) + if current_platform.is_cuda() and \ + current_platform.is_device_capability(100): + constraints = { + "is_persistent": True, + "epilogue_subtile": 1, + } + opt_flags.update_opt_flags_constraints(constraints) + # transpose the tensor so that the quantization axis is on dim1 + quant_tensor = quant_tensor.transpose(-2, -1) + scale = scale.transpose(-2, -1) + quant_tensor = convert_layout(wrap_torch_tensor(quant_tensor, dtype=FP4), + value_layout, **value_layout_opts) + scale = convert_layout(wrap_torch_tensor(scale), scale_layout, + **scale_layout_opts) + return quant_tensor, InFlexData(), scale + + def _can_support_mxfp4(use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index cd32f12f3c..48a347a8f5 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -11,6 +11,27 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +def shuffle_weight(w: torch.Tensor) -> torch.Tensor: + # Shuffle weight along the last dimension so that + # we folded the weights to adjance location + # Example: + # input: + # [[1, 2, 3, 4, 5, 6], + # [7, 8, 9, 10, 11, 12]] + # output: + # [[1, 4, 2, 5, 3, 6], + # [7, 10, 8, 11, 9, 12]] + # This will be used together with triton swiglu kernel + shape = w.shape + N = shape[-1] + first = w[..., :N // 2] + second = w[..., N // 2:] + + stacked = torch.stack((first, second), dim=-1) + w_shuffled = stacked.reshape(shape) + return w_shuffled + + def get_token_bin_counts_and_mask( tokens: torch.Tensor, vocab_size: int, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index e39cdf76dc..7a0abf5b59 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3254,6 +3254,12 @@ def has_deep_gemm() -> bool: return _has_module("deep_gemm") +def has_triton_kernels() -> bool: + """Whether the optional `triton_kernels` package is available.""" + + return _has_module("triton_kernels") + + def set_process_title(name: str, suffix: str = "", append: bool = False) -> None: From f0964e29cb3b2deccdad89f5f8c068d3a629d239 Mon Sep 17 00:00:00 2001 From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:28:50 +0300 Subject: [PATCH 093/932] [Benchmark] Add benchmark tool for multi turn conversations (#20267) --- benchmarks/multi_turn/README.md | 71 + benchmarks/multi_turn/bench_dataset.py | 493 ++++++ benchmarks/multi_turn/bench_utils.py | 25 + .../benchmark_serving_multi_turn.py | 1557 +++++++++++++++++ .../multi_turn/convert_sharegpt_to_openai.py | 354 ++++ .../multi_turn/generate_multi_turn.json | 35 + benchmarks/multi_turn/requirements.txt | 5 + 7 files changed, 2540 insertions(+) create mode 100644 benchmarks/multi_turn/README.md create mode 100644 benchmarks/multi_turn/bench_dataset.py create mode 100644 benchmarks/multi_turn/bench_utils.py create mode 100644 benchmarks/multi_turn/benchmark_serving_multi_turn.py create mode 100644 benchmarks/multi_turn/convert_sharegpt_to_openai.py create mode 100644 
benchmarks/multi_turn/generate_multi_turn.json create mode 100644 benchmarks/multi_turn/requirements.txt diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md new file mode 100644 index 0000000000..ae0866ae60 --- /dev/null +++ b/benchmarks/multi_turn/README.md @@ -0,0 +1,71 @@ +# Benchmark KV Cache Offloading with Multi-Turn Conversations + +The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt` + +First start serving your model + +```bash +export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +vllm serve $MODEL_NAME --disable-log-requests +``` + +## Synthetic Multi-Turn Conversations + +Download the following text file (used for generation of synthetic conversations) + +```bash +wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 +mv 1184.txt.utf-8 pg1184.txt +``` + +The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`). + +But you may use other text files if you prefer (using this specific file is not required). + +Then run the benchmarking script + +```bash +export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \ +--num-clients 2 --max-active-conversations 6 +``` + +You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.). + +If successful, you will see the following output + +```bash +---------------------------------------------------------------------------------------------------- +Statistics summary: +runtime_sec = 215.810 +requests_per_sec = 0.769 +---------------------------------------------------------------------------------------------------- + count mean std min 25% 50% 75% 90% 99% max +ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54 +tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05 +latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94 +input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00 +input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00 +output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00 +output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00 +---------------------------------------------------------------------------------------------------- +``` + +## ShareGPT Conversations + +To run with the ShareGPT data, download the following ShareGPT dataset: +`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json` + +Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py` + +```bash +python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128 +``` + +The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles. + +The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed). + +Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`. 
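The generation config passed via `--input-file` (e.g. `generate_multi_turn.json`) follows the schema read by `parse_input_json_file` in `bench_dataset.py`. Below is a minimal sketch with illustrative values only; the bundled `generate_multi_turn.json` may differ, and each `distribution` block may instead use `constant`, `zipf`, `poisson`, or `lognormal` with their respective parameters (`value`, `alpha`, `mean`/`sigma`, optional `max`).

```json
{
    "filetype": "generate_conversations",
    "num_conversations": 24,
    "text_files": ["pg1184.txt"],
    "print_stats": false,
    "prompt_input": {
        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
        "common_prefix_num_tokens": {"distribution": "constant", "value": 500},
        "prefix_num_tokens": {"distribution": "uniform", "min": 256, "max": 1024},
        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
    },
    "prompt_output": {
        "num_tokens": {"distribution": "uniform", "min": 80, "max": 120}
    }
}
```

Note that `num_turns` is rounded up to an even value at generation time so every user prompt gets an assistant answer, and `common_prefix_num_tokens` is optional (it defaults to a constant 0 when omitted).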
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py new file mode 100644 index 0000000000..411b89dd23 --- /dev/null +++ b/benchmarks/multi_turn/bench_dataset.py @@ -0,0 +1,493 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from statistics import mean +from typing import Any, NamedTuple, Optional, Union + +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_utils import ( + TEXT_SEPARATOR, + Color, + logger, +) +from transformers import AutoTokenizer # type: ignore + +# Conversation ID is a string (e.g: "UzTK34D") +ConvId = str + +# A list of dicts (dicts with keys "id" and "messages") +ShareGptConversations = list[dict[str, Any]] + +# A list of dicts (dicts with keys "role" and "content") +MessagesList = list[dict[str, str]] + +# Map conversation ID to conversation messages +ConversationsMap = list[ConvId, MessagesList] + + +class Distribution(ABC): + @abstractmethod + def sample(self, size: int = 1) -> np.ndarray: + pass + + +class UniformDistribution(Distribution): + def __init__( + self, + min_val: Union[int, float], + max_val: Union[int, float], + is_integer: bool = True, + ) -> None: + self.min_val = min_val + self.max_val = max_val + self.is_integer = is_integer + + def sample(self, size: int = 1) -> np.ndarray: + if self.is_integer: + return np.random.randint( + int(self.min_val), int(self.max_val + 1), size=size + ) + else: + return np.random.uniform(self.min_val, self.max_val, size=size) + + def __repr__(self) -> str: + return f"UniformDistribution[{self.min_val}, {self.max_val}]" + + +class ConstantDistribution(Distribution): + def __init__(self, value: Union[int, float]) -> None: + self.value = value + self.max_val = value + + def sample(self, size: int = 1) -> np.ndarray: + return np.full(shape=size, fill_value=self.value) + + def __repr__(self) -> str: + return f"Constant[{self.value}]" + + +class ZipfDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.zipf(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"ZipfDistribution[{self.alpha}]" + + +class PoissonDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.poisson(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"PoissonDistribution[{self.alpha}]" + + +class LognormalDistribution(Distribution): + def __init__( + self, mean: float, sigma: float, max_val: Optional[int] = None + ) -> None: + self.mean = mean + self.sigma = sigma + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + + return np.round(samples).astype(int) + + def __repr__(self) -> str: + return f"LognormalDistribution[{self.mean}, {self.sigma}]" + + +class GenConvArgs(NamedTuple): + num_conversations: int + text_files: list[str] + input_num_turns: Distribution + input_common_prefix_num_tokens: Distribution + 
input_prefix_num_tokens: Distribution + input_num_tokens: Distribution + output_num_tokens: Distribution + print_stats: bool + + +def verify_field_exists( + conf: dict, field_name: str, section: str, subsection: str +) -> None: + if field_name not in conf: + raise ValueError( + f"Missing field '{field_name}' in {section=} and {subsection=}" + ) + + +def get_random_distribution( + conf: dict, section: str, subsection: str, optional: bool = False +) -> Distribution: + # section can be "prompt_input" or "prompt_output" (both required) + conf = conf[section] + + if optional and subsection not in conf: + # Optional subsection, if not found assume the value is always 0 + return ConstantDistribution(0) + + # subsection can be "num_turns", "num_tokens" or "prefix_num_tokens" + if subsection not in conf: + raise ValueError(f"Missing subsection {subsection} in section {section}") + + conf = conf[subsection] + + distribution = conf.get("distribution") + if distribution is None: + raise ValueError( + f"Missing field 'distribution' in {section=} and {subsection=}" + ) + + if distribution == "constant": + verify_field_exists(conf, "value", section, subsection) + return ConstantDistribution(conf["value"]) + + elif distribution == "zipf": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return ZipfDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "poisson": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return PoissonDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "lognormal": + verify_field_exists(conf, "mean", section, subsection) + verify_field_exists(conf, "sigma", section, subsection) + max_val = conf.get("max", None) + return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val) + + elif distribution == "uniform": + verify_field_exists(conf, "min", section, subsection) + verify_field_exists(conf, "max", section, subsection) + + min_value = conf["min"] + max_value = conf["max"] + + assert min_value > 0 + assert min_value <= max_value + + is_integer = isinstance(min_value, int) and isinstance(max_value, int) + return UniformDistribution(min_value, max_value, is_integer) + else: + raise ValueError(f"Unknown distribution: {distribution}") + + +def parse_input_json_file(conf: dict) -> GenConvArgs: + # Validate the input file + assert isinstance(conf, dict) + required_fields = [ + "filetype", + "num_conversations", + "text_files", + "prompt_input", + "prompt_output", + ] + for field in required_fields: + assert field in conf, f"Missing field {field} in input {conf}" + + assert conf["filetype"] == "generate_conversations" + + assert conf["num_conversations"] > 0, "num_conversations should be larger than zero" + + text_files = conf["text_files"] + + assert isinstance(text_files, list), "Field 'text_files' should be a list" + assert len(text_files) > 0, ( + "Field 'text_files' should be a list with at least one file" + ) + + # Parse the parameters for the prompt input/output workload + input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns") + input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens") + input_common_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "common_prefix_num_tokens", optional=True + ) + input_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "prefix_num_tokens" + ) + output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens") + + 
print_stats: bool = conf.get("print_stats", False) + assert isinstance(print_stats, bool), ( + "Field 'print_stats' should be either 'true' or 'false'" + ) + + args = GenConvArgs( + num_conversations=conf["num_conversations"], + text_files=text_files, + input_num_turns=input_num_turns, + input_common_prefix_num_tokens=input_common_prefix_num_tokens, + input_prefix_num_tokens=input_prefix_num_tokens, + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + print_stats=print_stats, + ) + return args + + +def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None: + # Collect statistics + conv_stats: list[dict[Any, Any]] = [] + req_stats: list[int] = [] + + print("\nCollecting statistics...") + for messages in conversations.values(): + # messages is a list of dicts + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + request_tokens: list[int] = [] + + req_tokens = 0 + for m in messages: + content = m["content"] + num_tokens = len(tokenizer(content).input_ids) + + if m["role"] == "user": + user_tokens.append(num_tokens) + # New user prompt including all chat history + req_tokens += num_tokens + request_tokens.append(req_tokens) + + elif m["role"] == "assistant": + assistant_tokens.append(num_tokens) + # Update assistant answer + # (will be part of chat history for the next user prompt) + req_tokens += num_tokens + + item_stats = { + "conversation_turns": len(messages), + "user_tokens": mean(user_tokens), + "assistant_tokens": mean(assistant_tokens), + } + + conv_stats.append(item_stats) + req_stats.extend(request_tokens) + + # Print statistics + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(conv_stats) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Request statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(req_stats, columns=["request_tokens"]) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + + +def generate_conversations( + args: GenConvArgs, tokenizer: AutoTokenizer +) -> ConversationsMap: + # Text for all user prompts + # (text from the input text files will be appended to this line) + base_prompt_text = "Please rewrite the following text and add more content: " + base_prompt_token_count = len( + tokenizer.encode(base_prompt_text, add_special_tokens=False) + ) + + logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}") + logger.info(args) + + list_of_tokens = [] + + for filename in args.text_files: + # Load text file that will be used to generate prompts + with open(filename) as file: + data = file.read() + tokens_in_file = tokenizer.encode(data, add_special_tokens=False) + list_of_tokens.extend(tokens_in_file) + + conversations: ConversationsMap = {} + conv_id = 0 + + # Generate number of turns for every conversation + turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations) + + # Turn count should be at least 2 (one user prompt and one assistant answer) + turn_count = np.maximum(turn_count, 2) + + # Round up to an even number (every user prompt should have an answer) + turn_count = turn_count + (turn_count % 2) + + # Generate number of prefix tokens for every conversation + conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample( + args.num_conversations + ) + + # Used to reduce shared text between conversations + # (jump/skip over text 
sections between conversations) + base_offset = 0 + + # Common prefix size for all conversations (only 1 sample required) + common_prefix_text = "" + common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0] + if common_prefix_tokens > 0: + # Using "." at the end to separate sentences + common_prefix_text = ( + tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "." + ) + base_offset += common_prefix_tokens + + for conv_id in range(args.num_conversations): + # Generate a single conversation + messages: MessagesList = [] + + nturns = turn_count[conv_id] + + # User prompt token count per turn (with lower limit) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count = np.maximum(input_token_count, base_prompt_token_count) + + # Assistant answer token count per turn (with lower limit) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count = np.maximum(output_token_count, 1) + + user_turn = True + for turn_id in range(nturns): + if user_turn: + role = "user" + num_tokens = input_token_count[turn_id] + + # Generate the user prompt, + # use a unique prefix (the conv_id) for each conversation + # (to avoid shared prefix between conversations) + content = f"{conv_id} is a nice number... " + + if len(common_prefix_text) > 0 and turn_id == 0: + content = common_prefix_text + content + + # Update the number of tokens left for the content + num_tokens -= len(tokenizer.encode(content, add_special_tokens=False)) + + if turn_id == 0: + prefix_num_tokens = conv_prefix_tokens[conv_id] + if prefix_num_tokens > 0: + # Add prefix text (context) to the first turn + start_offset = base_offset + end_offset = start_offset + prefix_num_tokens + assert len(list_of_tokens) > end_offset, ( + "Not enough input text to generate " + f"{prefix_num_tokens} tokens for the " + f"prefix text ({start_offset=}, {end_offset=})" + ) + + content += f"{conv_id}, " + tokenizer.decode( + list_of_tokens[start_offset:end_offset] + ) + base_offset += prefix_num_tokens + + # Add the actual user prompt/question after the prefix text + content += base_prompt_text + num_tokens -= base_prompt_token_count + + if num_tokens > 0: + # Add text from the input file (to reach the desired token count) + start_offset = base_offset + turn_id * input_token_count.max() + end_offset = start_offset + num_tokens + assert len(list_of_tokens) > end_offset, ( + f"Not enough input text to generate {num_tokens} tokens " + f"for the prompt ({start_offset=}, {end_offset=})" + ) + + # Convert tokens back to text + content += tokenizer.decode(list_of_tokens[start_offset:end_offset]) + else: + role = "assistant" + # This content will not be used as input to the LLM server + # (actual answers will be used instead). + # Content is only required to determine the min_tokens/max_tokens + # (inputs to the LLM server). 
+ num_tokens = output_token_count[turn_id] + assert len(list_of_tokens) > num_tokens, ( + f"Not enough input text to generate {num_tokens} " + "tokens for assistant content" + ) + content = tokenizer.decode(list_of_tokens[:num_tokens]) + + # Append the user/assistant message to the list of messages + messages.append({"role": role, "content": content}) + user_turn = not user_turn + + # Add the new conversation + conversations[f"CONV_ID_{conv_id}"] = messages + + # Increase base offset for the next conversation + base_offset += nturns + + if args.print_stats: + print_conv_stats(conversations, tokenizer) + + return conversations + + +def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap: + conversations: ConversationsMap = {} + + for item in input_list: + conv_id: str = item["id"] + assert isinstance(conv_id, str) + + assert conv_id not in conversations, ( + f"Conversation ID {conv_id} found more than once in the input" + ) + + messages: MessagesList = item["messages"] + assert isinstance(messages, list), ( + f"Conversation messages should be a list (ID: {conv_id})" + ) + assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})" + + conversations[conv_id] = messages + + logger.info(f"Using {len(conversations)} unique conversations (IDs)") + assert len(conversations) == len(input_list) + + # Print statistics about the selected conversations + stats: list[dict[str, Any]] = [] + for conv_data in conversations.values(): + stats.append({"num_turns": len(conv_data)}) + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles) + print(conv_stats.transpose()) + print(TEXT_SEPARATOR) + + return conversations + + +def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations: + output: ShareGptConversations = [] + for conv_id, conv_data in input_dict.items(): + new_item = {"id": conv_id, "messages": conv_data} + output.append(new_item) + + return output diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py new file mode 100644 index 0000000000..d4d3c1ca8c --- /dev/null +++ b/benchmarks/multi_turn/bench_utils.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from enum import Enum + + +class Color(str, Enum): + RED = "\033[91m" + GREEN = "\033[92m" + BLUE = "\033[94m" + PURPLE = "\033[95m" + CYAN = "\033[96m" + YELLOW = "\033[93m" + RESET = "\033[0m" + + +TEXT_SEPARATOR = "-" * 100 + +# Configure the logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] - %(message)s", + datefmt="%d-%m-%Y %H:%M:%S", +) +logger = logging.getLogger(__name__) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py new file mode 100644 index 0000000000..53c3207491 --- /dev/null +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -0,0 +1,1557 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import json +import logging +import multiprocessing as mp +import os +import random +import time +from collections import Counter, deque +from datetime import datetime +from enum import Enum +from http import HTTPStatus +from statistics import mean 
+from typing import NamedTuple, Optional, Union + +import aiohttp # type: ignore +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_dataset import ( + ConversationsMap, + ConvId, + GenConvArgs, + MessagesList, + ShareGptConversations, + conversations_dict_to_list, + conversations_list_to_dict, + generate_conversations, + parse_input_json_file, +) +from bench_utils import TEXT_SEPARATOR, Color, logger +from transformers import AutoTokenizer # type: ignore + +NUM_TOKENS_FROM_DATASET = 0 +TERM_SIGNAL = None + + +class ConversationSampling(str, Enum): + ROUND_ROBIN = "round_robin" + RANDOM = "random" + + def __str__(self): + return self.value + + +class ClientArgs(NamedTuple): + seed: int + max_num_requests: Optional[int] + skip_first_turn: bool + max_turns: Optional[int] + max_active_conversations: int + verbose: bool + print_content: bool + verify_output: bool + conversation_sampling: ConversationSampling + request_rate: float + + +class RequestArgs(NamedTuple): + chat_url: str + model: str + stream: bool + limit_min_tokens: int # Use negative value for no limit + limit_max_tokens: int # Use negative value for no limit + + +class BenchmarkArgs(NamedTuple): + url: str + num_clients: int + early_stop: bool + + +class ServerResponse(NamedTuple): + valid: bool + ttft_ms: float # time to first chunk + tpot_ms: float # time per output chunk (one or more tokens) + latency_ms: float + start_time_ms: float + first_chunk: str # first chunk of the content + content: str # includes the first_chunk + num_chunks: int + + def __str__(self) -> str: + return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501 + + +class RequestStats(NamedTuple): + ttft_ms: float + tpot_ms: float + latency_ms: float + start_time_ms: float + input_num_turns: int + input_num_tokens: int + output_num_tokens: int + output_num_chunks: int + output_num_first_chunk_tokens: int + approx_cached_percent: float + conversation_id: str + client_id: int + + def __str__(self) -> str: + return ( + f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501 + f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501 + f"approx_cached_percent {self.approx_cached_percent:.2f}%" + ) + + +class MetricStats: + def __init__(self) -> None: + self.min: Optional[float] = None + self.max: Optional[float] = None + self.avg: Optional[float] = None + self.sum = 0.0 + self.count = 0 + + def update(self, value: float) -> None: + if self.min is None: + self.min = value + else: + self.min = min(self.min, value) + + if self.max is None: + self.max = value + else: + self.max = max(self.max, value) + + self.sum += value + self.count += 1 + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}" + + +class MovingAverage: + def __init__(self, window_size: int) -> None: + self.window_size = window_size + self.window = np.zeros(window_size) + self.index = 0 + self.sum = 0.0 + self.count = 0 + self.avg: Optional[float] = None + + def update(self, new_value: float) -> None: + if self.count < self.window_size: + # Filling up the window + self.sum += new_value + self.window[self.count] = new_value + self.count += 1 + else: + # Window is full, start replacing old values + 
old_value = self.window[self.index] + self.sum = self.sum - old_value + new_value + self.window[self.index] = new_value + self.index = (self.index + 1) % self.window_size + + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f} ({self.count} samples)" + + +class DebugStats: + def __init__(self, logger: logging.Logger, window_size: int) -> None: + self.logger = logger + self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + "moving_avg_ttft_ms": MovingAverage(window_size), + "moving_avg_tpot_ms": MovingAverage(window_size), + "ttft_ms": MetricStats(), + "tpot_ms": MetricStats(), + "latency_ms": MetricStats(), + "input_num_turns": MetricStats(), + "input_num_tokens": MetricStats(), + "output_num_tokens": MetricStats(), + } + + def update(self, data: RequestStats) -> None: + self.metrics["ttft_ms"].update(data.ttft_ms) + self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms) + self.metrics["tpot_ms"].update(data.tpot_ms) + self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms) + self.metrics["latency_ms"].update(data.latency_ms) + self.metrics["input_num_turns"].update(data.input_num_turns) + self.metrics["input_num_tokens"].update(data.input_num_tokens) + self.metrics["output_num_tokens"].update(data.output_num_tokens) + + def print(self) -> None: + self.logger.info("-" * 50) + for k, v in self.metrics.items(): + kv_info = f"[{k:25}] {v}" + self.logger.info(kv_info) + self.logger.info("-" * 50) + + +# Must support Python 3.8, we can't use str.removeprefix(prefix) +# introduced in Python 3.9 +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + +def nanosec_to_millisec(value: float) -> float: + return value / 1000000.0 + + +def nanosec_to_sec(value: float) -> float: + return value / 1000000000.0 + + +async def send_request( + session: aiohttp.ClientSession, + messages: list[dict[str, str]], + chat_url: str, + model: str, + stream: bool = True, + min_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, +) -> ServerResponse: + payload = { + "model": model, + "messages": messages, + "seed": 0, + "temperature": 0.0, + } + + if stream: + payload["stream"] = True + payload["stream_options"] = {"include_usage": False} + + if min_tokens is not None: + payload["min_tokens"] = min_tokens + + if max_tokens is not None: + payload["max_tokens"] = max_tokens + + headers = {"Content-Type": "application/json"} + + # Calculate the timeout for the request + timeout_sec = 120 + if max_tokens is not None: + # Assume TPOT of 200ms and use max_tokens to determine timeout + timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + timeout = aiohttp.ClientTimeout(total=timeout_sec) + + valid_response = True + ttft: Optional[float] = None + chunk_delay: list[int] = [] + latency: Optional[float] = None + first_chunk = "" + generated_text = "" + + start_time: int = time.perf_counter_ns() + most_recent_timestamp: int = start_time + + async with session.post( + url=chat_url, json=payload, headers=headers, timeout=timeout + ) as response: + http_status = HTTPStatus(response.status) + if http_status == HTTPStatus.OK: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + if chunk == "[DONE]": + # End of stream + latency = time.perf_counter_ns() - start_time + elif stream is False: + data = json.loads(chunk) + message = 
data["choices"][0]["message"] + assert message["role"] == "assistant" + generated_text += message["content"] + else: + timestamp: int = time.perf_counter_ns() + data = json.loads(chunk) + + # Delta is the new content/text/data + delta = data["choices"][0]["delta"] + if delta.get("content", None): + if ttft is None: + # First token + first_token_time = time.perf_counter_ns() + ttft = first_token_time - start_time + first_chunk = delta["content"] + else: + # Decoding phase + chunk_delay.append(timestamp - most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + else: + valid_response = False + content = await response.text() + logger.warning( + f"{Color.YELLOW}Received HTTP status {http_status.value} " + f"({http_status.phrase}): {content}{Color.RESET}" + ) + + if latency is None: + latency = -1.0 + if valid_response: + # Streaming is disabled, latency was not set + latency = time.perf_counter_ns() - start_time + + if ttft is None: + # The response was a single chunk + ttft = latency + + # Each chunk may include more than one token + tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0 + num_chunks: int = len(chunk_delay) + + sr = ServerResponse( + valid=valid_response, + ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0, + tpot_ms=nanosec_to_millisec(tpot), + latency_ms=nanosec_to_millisec(latency), + start_time_ms=nanosec_to_millisec(start_time), + first_chunk=first_chunk, + content=generated_text, + num_chunks=num_chunks, + ) + return sr + + +def get_short_string(input: str) -> str: + n = 20 + if len(input) < 400: + return input + + return f"{input[:n]}...{input[-n:]}" + + +def get_token_count(tokenizer: AutoTokenizer, text: str) -> int: + return len(tokenizer(text, add_special_tokens=False).input_ids) + + +def get_messages_token_count( + tokenizer: AutoTokenizer, messages: list[dict[str, str]] +) -> int: + token_count = 0 + for m in messages: + token_count += get_token_count(tokenizer, m["content"]) + + return token_count + + +async def send_turn( + session: aiohttp.ClientSession, + client_id: int, + conv_id: str, + conversation_messages: MessagesList, + messages_to_use: int, + tokenizer: AutoTokenizer, + req_args: RequestArgs, + verbose: bool, + verify_output: bool, +) -> Optional[RequestStats]: + assert messages_to_use > 0 + assert messages_to_use <= len(conversation_messages) + + messages = conversation_messages[:messages_to_use] + + # Index of the next message (the role should be "user") + index = messages_to_use - 1 + + # Verify that the message has only two keys, "role" and "content" + assert len(messages[index].keys()) == 2 + assert "role" in messages[index] and "content" in messages[index] + assert messages[index]["role"] == "user", ( + f"Failed on conversation ID {conv_id}, message role should be user" + ) + + if verbose: + print( + f"{Color.CYAN}Messages (conversation ID {conv_id}," + f" {len(messages)} turns):{Color.RESET}", + messages, + ) + + # None means that there is no upper/lower limit for the output token count + min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens + max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens + + if len(conversation_messages) > messages_to_use: + # The conversation contains an assistant answer for the next user prompt + if ( + min_tokens == NUM_TOKENS_FROM_DATASET + or max_tokens == NUM_TOKENS_FROM_DATASET + ): + # Compute number of tokens in the answer (from the input conversation) + assistant_answer = 
conversation_messages[messages_to_use] + answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"]) + assert assistant_answer["role"] == "assistant" + + if min_tokens == NUM_TOKENS_FROM_DATASET: + min_tokens = max(1, answer_num_tokens) + + if max_tokens == NUM_TOKENS_FROM_DATASET: + max_tokens = max(1, answer_num_tokens) + + # Send the current conversation to LLM and get a response + response: ServerResponse = await send_request( + session, + messages, + req_args.chat_url, + req_args.model, + req_args.stream, + min_tokens, + max_tokens, + ) + + if response.valid is False: + # Request failed + return None + + # Compute number of tokens in input / output + input_num_tokens = get_messages_token_count(tokenizer, messages) + + # Num tokens in the user's last question + question_num_tokens = get_token_count(tokenizer, messages[index]["content"]) + + # Num tokens in the history/context of the question + assert input_num_tokens >= question_num_tokens + history_num_tokens = input_num_tokens - question_num_tokens + + # Num tokens in the LLM's answer (first chunk and full answer) + first_chunk_tokens = get_token_count(tokenizer, response.first_chunk) + + output_content = response.content + output_num_tokens = get_token_count(tokenizer, output_content) + + # Prefix caching approximated cached percent + approx_cached_percent = ( + 100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0 + ) + + # Compute the correct TTFT and TPOT (based on tokens and not chunks). + # Required because multiple output tokens may be bundled in a single chunk. + if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens: + # More than one token and more than one chunk in the output + decode_ms = response.latency_ms - response.ttft_ms + decode_num_tokens = output_num_tokens - first_chunk_tokens + tpot_ms = decode_ms / decode_num_tokens + else: + # In this case: output_num_tokens == first_chunk_tokens + # Output was a single chunk (output_num_tokens > 1) + # or even a single token (output_num_tokens == 1) + tpot_ms = 0.0 + + if first_chunk_tokens > 1: + # First chunk had multiple tokens, adjust TTFT for a single token + delta_ms = (first_chunk_tokens - 1) * tpot_ms + ttft_ms = max(0.1, response.ttft_ms - delta_ms) + else: + # First chunk had only one token + ttft_ms = response.ttft_ms + + rs = RequestStats( + ttft_ms=ttft_ms, + tpot_ms=tpot_ms, + latency_ms=response.latency_ms, + start_time_ms=response.start_time_ms, + input_num_turns=len(messages), + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + output_num_chunks=response.num_chunks, + output_num_first_chunk_tokens=first_chunk_tokens, + approx_cached_percent=approx_cached_percent, + conversation_id=conv_id, + client_id=client_id, + ) + + if verbose: + print( + f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}", + output_content, + ) + print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}") + print("-" * 70) + + # Save the LLM's answer (will be used as part of the context for the next user turn) + answer_index = messages_to_use + if len(conversation_messages) > answer_index: + assert conversation_messages[answer_index]["role"] == "assistant", ( + f"Failed on conversation ID {conv_id}, message role should be assistant" + ) + + orig_content = conversation_messages[answer_index]["content"] + if verify_output: + # Compare the new answer to the answer from the input file + debug_info = ( + f"LLM/dataset answers do not match ({conv_id}):" + 
f"\n'{get_short_string(output_content)}' (len: {len(output_content)})," + f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})" + ) + if orig_content != output_content: + raise ValueError(debug_info) + + # Update the answer + conversation_messages[answer_index]["content"] = output_content + else: + # A user prompt that has no answer, add the answer as a new message + new_answer = {"role": "assistant", "content": output_content} + conversation_messages.append(new_answer) + + return rs + + +async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: + # Generate a random time interval from the Poisson distribution + assert request_rate > 0 + + interval = np.random.exponential(1.0 / request_rate) + if verbose: + logger.info(f"Sleeping for {interval:.3f} seconds...") + await asyncio.sleep(interval) + + +async def client_main( + args: ClientArgs, + req_args: RequestArgs, + client_id: int, + tokenizer: AutoTokenizer, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + logger.info( + f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 + ) + + random.seed(args.seed) + np.random.seed(args.seed) + + # Active conversations + active_convs: ConversationsMap = {} + conv_id_queue: deque = deque(maxlen=args.max_active_conversations) + + # Keep track of how many messages have been used for each conversation + turns_count: Counter = Counter() + num_successes = 0 + num_failures = 0 + + # Track the timestamp (time.perf_counter()) + # of the last turn per conversation (only for debug) + time_of_last_turn: dict[ConvId, float] = {} + + # Flag that indicates that there are no new tasks (conversations) for the client + task_queue_empty = False + + async with aiohttp.ClientSession() as session: + # Print progress + + while task_queue_empty is False: + result = None + + if ( + args.max_num_requests + and num_successes + num_failures == args.max_num_requests + ): + logger.info( + f"{Color.YELLOW}Client {client_id} reached " + f"request limit{Color.RESET}" + ) + break + + if stop_event.is_set(): # type: ignore + logger.info( + f"{Color.YELLOW}Client {client_id} received " + f"a termination signal{Color.RESET}" + ) + break + + while ( + len(active_convs) < args.max_active_conversations + and task_queue_empty is False + ): + # Get a new conversation from the task queue + conv_id, messages = task_queue.get() + + if conv_id is TERM_SIGNAL: + task_queue_empty = True + break + + if args.skip_first_turn: + # Skip the first turn (both user and assistant), + # relevant if warmup was enabled. + # Default turns_count[conv_id] will be zero if conv_id + # was never inserted/updated in turns_count. 
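+                    # Illustrative example: for a 4-message conversation
+                    # [user, assistant, user, assistant] whose first turn was
+                    # already sent during warmup, the counter starts at 2, so
+                    # the first benchmark request below will send messages[:3],
+                    # ending with the second user prompt.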
+ turns_count[conv_id] += 2 + + if turns_count[conv_id] < len(messages): + # Add new conversation + active_convs[conv_id] = messages + conv_id_queue.append(conv_id) + + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501 + ) + + elif args.verbose: + # No more messages (conversation finished during the warmup) + logger.info( + f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501 + ) + + if len(active_convs) == 0 or task_queue_empty: + logger.info( + f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}" + ) + break + + # Pick an active conversation for the next request + if args.conversation_sampling == ConversationSampling.ROUND_ROBIN: + conv_id = conv_id_queue.pop() + else: + # ConversationSampling.RANDOM + active_ids = list(active_convs.keys()) + conv_id = random.choice(active_ids) + + messages = active_convs[conv_id] + assert isinstance(messages, list) and len(messages) > 0 + + # Update the amount of messages to use + turns_count[conv_id] += 1 + current_turn = turns_count[conv_id] + + assert current_turn < len(messages), ( + f"Turn number {current_turn} is invalid for conversation ID {conv_id}" + f" that has only {len(messages)} messages" + ) + + if args.verbose: + curr_time_sec: float = time.perf_counter() + time_since_last_turn: Union[str, float] = "N/A" + if conv_id in time_of_last_turn: + time_since_last_turn = round( + curr_time_sec - time_of_last_turn[conv_id], 3 + ) + logger.info( + f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501 + ) + time_of_last_turn[conv_id] = curr_time_sec + + success = True + try: + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + else: + # None means that the request failed, + # and should not be added to the statistics. 
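+                        # (send_turn returns None when the server response was
+                        # invalid, e.g. a non-200 HTTP status; the conversation
+                        # is removed below so it will not be retried.)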
+ success = False + num_failures += 1 + + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + + except asyncio.exceptions.TimeoutError: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + except Exception: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + if success: + num_successes += 1 + + # Update the turns counter to include the LLM response + # The LLM response will be used as context for the next user turn + turns_count[conv_id] += 1 + + max_turns = len(messages) + if args.max_turns is not None: + # Limit the number of turns in the conversation + max_turns = min(args.max_turns, max_turns) + + if turns_count[conv_id] >= max_turns: + # Conversation has no more turns (no longer active) + # save the updated conversation (with the LLM server's answer) + conv_queue.put((conv_id, active_convs.pop(conv_id))) + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} finished " + f"conversation ID {conv_id}{Color.RESET}" + ) + else: + # Conversation is not finished, insert it at the back of the queue + conv_id_queue.appendleft(conv_id) + + # Sleep between requests (if lambda is positive) + if args.request_rate > 0: + await poisson_sleep(args.request_rate, args.verbose) + + # Send indication that the client is done + conv_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + logger.info( + f"{Color.CYAN}Client {client_id} is done " + f"({num_successes=}, {num_failures=}){Color.RESET}" + ) + + +def worker_function( + client_id: int, + tokenizer: AutoTokenizer, + client_args: ClientArgs, + req_args: RequestArgs, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + asyncio.run( + client_main( + client_args, + req_args, + client_id, + tokenizer, + stop_event, + task_queue, + result_queue, + conv_queue, + ) + ) + + +def get_client_config( + args: argparse.Namespace, input_conv: ConversationsMap +) -> tuple[ClientArgs, RequestArgs]: + if args.num_clients < 1: + raise ValueError("Number of clients must be a positive number") + + if len(input_conv) < args.num_clients: + raise ValueError( + "Number of conversations must be equal or larger than the number of clients" + ) + + max_req_per_client: Optional[int] = None + if args.max_num_requests is not None: + # Max number of requests per client + req_per_client = args.max_num_requests // args.num_clients + if req_per_client < 1: + raise ValueError("Number of requests should be at least one per client") + max_req_per_client = req_per_client + + max_active_conversations = args.max_active_conversations + if max_active_conversations is None: + # Each client will have only one active conversation at a time + max_active_conversations = args.num_clients + + if max_active_conversations > len(input_conv): + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or less than the total number of conversations" + ) + + # Max number of active conversations per client + max_active_conv_per_client = 
max_active_conversations // args.num_clients + if max_active_conv_per_client < 1: + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or greater than the number of clients" + ) + + # Skip the first user turn (as part of the warmup) + skip_first_turn = args.warmup_step + + # Common arguments for all clients + client_args = ClientArgs( + seed=args.seed, + max_num_requests=max_req_per_client, + skip_first_turn=skip_first_turn, + max_turns=args.max_turns, + max_active_conversations=max_active_conv_per_client, + verbose=args.verbose, + print_content=args.print_content, + verify_output=args.verify_output, + conversation_sampling=args.conversation_sampling, + request_rate=args.request_rate, + ) + + if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: + if args.limit_min_tokens < 1 or args.limit_max_tokens < 1: + raise ValueError( + "Invalid min/max tokens limits (both limits should be provided)" + ) + if args.limit_min_tokens > args.limit_max_tokens: + raise ValueError( + "Invalid min/max tokens limits (min should not be larger than max)" + ) + + # Arguments for API requests + chat_url = f"{args.url}/v1/chat/completions" + req_args = RequestArgs( + chat_url=chat_url, + model=args.model, + stream=not args.no_stream, + limit_min_tokens=args.limit_min_tokens, + limit_max_tokens=args.limit_max_tokens, + ) + + return client_args, req_args + + +async def main_mp( + client_args: ClientArgs, + req_args: RequestArgs, + bench_args: BenchmarkArgs, + tokenizer: AutoTokenizer, + input_conv: ConversationsMap, +) -> tuple[ConversationsMap, list[RequestStats]]: + # An event that will trigger graceful termination of all the clients + stop_event = mp.Event() + + # Queue for input conversations (from the input file/dataset) + task_queue: mp.Queue = mp.Queue() + + # Queue for client measurements (TTFT, TPOT, etc. 
for each request) + result_queue: mp.Queue = mp.Queue() + + # Queue for output conversations (with the LLM answers, sent by the server) + conv_queue: mp.Queue = mp.Queue() + output_conv: ConversationsMap = {} + client_metrics: list[RequestStats] = [] + + # Start all clients + start_time = time.perf_counter_ns() + logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}") + + clients = [] + for client_id in range(bench_args.num_clients): + client = mp.Process( + name=f"client_{client_id}", + target=worker_function, + args=( + client_id, + tokenizer, + client_args, + req_args, + stop_event, + task_queue, + result_queue, + conv_queue, + ), + ) + clients.append(client) + client.start() + + # Submit all the input conversations as tasks for the clients + for conv_id, messages in input_conv.items(): + task_queue.put((conv_id, messages)) + + # Add termination signals for clients + for _ in range(bench_args.num_clients): + task_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + # Collect the updated conversations from all clients + num_clients_finished = 0 + total_convs = len(input_conv) + + debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500)) + + while num_clients_finished < bench_args.num_clients: + # Collect updated conversation + conv_id, messages = conv_queue.get() + + # Collect results (measurements) + while not result_queue.empty(): + new_data = result_queue.get() + client_metrics.append(new_data) + debug_stats.update(new_data) + + if conv_id is TERM_SIGNAL: + num_clients_finished += 1 + logger.info( + f"{Color.CYAN}{num_clients_finished} out of " + f"{bench_args.num_clients} clients finished{Color.RESET}" + ) + + if bench_args.early_stop and not stop_event.is_set(): + # Once one client finished, stop all other clients. + # there is no reason to continue the benchmark with fewer clients. + logger.info( + f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}" + ) + stop_event.set() + else: + output_conv[conv_id] = messages + + finished_convs = len(output_conv) + percent = finished_convs / total_convs + + # Tuned to control the print rate (can be changed if required) + print_cycle = max(3, int(bench_args.num_clients / 4)) + + if finished_convs % print_cycle == 0: + runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time) + logger.info( + f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501 + f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 + ) + + rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + if len(client_metrics) < (5 * bench_args.num_clients): + # Do not estimate the RPS if the number of samples is very low + # (threshold can be tuned if needed) + rps = "N/A" + + runtime_left_sec: Union[str, float] = round( + (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 + ) + if percent < 0.05: + # If less than 5% of the conversations were not finished, + # the estimation will probably be very inaccurate + # (threshold can be tuned if needed). + runtime_left_sec = "N/A" + + logger.info( + f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501 + ) + debug_stats.print() + + logger.info( + f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}" + ) + + # At this point all the clients finished, + # collect results (TTFT, TPOT, etc.) from all the clients. 
+ # This needs to happens before calling join on the clients + # (result_queue should be emptied). + while not result_queue.empty(): + client_metrics.append(result_queue.get()) + + logger.info(f"Collected {len(client_metrics)} samples from all the clients") + + # Wait for all clients to finish + for client in clients: + logger.info( + f"{Color.CYAN}Waiting for client {client.name} " + f"(is alive: {client.is_alive()}){Color.RESET}" + ) + + client.join(timeout=120) + + if client.is_alive(): + logger.warning( + f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}" + ) + client.terminate() + + exitcode = client.exitcode + if exitcode != 0: + logger.error( + f"{Color.RED}Client {client.name} exited " + f"with exit code {exitcode}{Color.RESET}" + ) + + logger.info( + f"All {bench_args.num_clients} clients exited (successfully " + f"finished {len(output_conv)} out of {total_convs} conversations)" + ) + + # Queues should be closed, required to avoid hang at interpreter shutdown + unfinished_tasks = 0 + while not task_queue.empty(): + task_queue.get() + unfinished_tasks += 1 + + if unfinished_tasks > 0: + # Can happen if not all tasks (conversations) have finished. + # May happen if --max-num-requests was used, + # or if an error occurred in one of the clients. + logger.debug(f"Discarding {unfinished_tasks} unfinished tasks") + + task_queue.close() + task_queue.join_thread() + + result_queue.close() + result_queue.join_thread() + + conv_queue.close() + conv_queue.join_thread() + + return output_conv, client_metrics + + +def get_filename_with_timestamp(label: str, extension: str) -> str: + time_now = datetime.now() + timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S") + filename = f"{label}__{timestamp}.{extension}" + return filename + + +def process_statistics( + client_metrics: list[RequestStats], + warmup_percentages: list[float], + test_params: dict, + verbose: bool, + gen_conv_args: Optional[GenConvArgs] = None, + excel_output: bool = False, +) -> None: + if len(client_metrics) == 0: + logger.info("No samples to process") + return + + logger.info(f"Processing {len(client_metrics)} samples...") + + raw_data = pd.DataFrame(client_metrics) + + if verbose: + # Calculate the time between user turns in each conversation (in a new column) + raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"]) + raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[ + "start_time_ms" + ].diff() + + # Convert milliseconds to seconds + raw_data["time_between_user_turns_sec"] = ( + raw_data["time_between_user_turns_sec"] / 1000.0 + ) + + # Final raw data should be sorted by time + raw_data = raw_data.sort_values(by=["start_time_ms"]) + raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"] + + percentiles = [0.25, 0.5, 0.75, 0.9] + + # Add more percentiles if there are enough samples + if len(raw_data) >= 100: + percentiles.append(0.99) + + if len(raw_data) >= 1000: + percentiles.append(0.999) + + if len(raw_data) >= 10000: + percentiles.append(0.9999) + + # Set precision for numbers in the output text (the dataframes) + pd.set_option("display.precision", 2) + + # Exclude parameters from RequestStats + exclude = [ + "start_time_ms", + "end_time_ms", + "output_num_first_chunk_tokens", + "approx_cached_percent", + "conversation_id", + "client_id", + ] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Parameters:{Color.RESET}") + for k, v in test_params.items(): + print(f"{k}={v}") + + # conversations generation parameters + if gen_conv_args is 
not None: + gen_params = { + "text_files": ", ".join(gen_conv_args.text_files), + "input_num_turns": str(gen_conv_args.input_num_turns), + "input_common_prefix_num_tokens": str( + gen_conv_args.input_common_prefix_num_tokens + ), + "input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens), + "input_num_tokens": str(gen_conv_args.input_num_tokens), + "output_num_tokens": str(gen_conv_args.output_num_tokens), + } + + print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}") + for k, v in gen_params.items(): + print(f"{k}={v}") + + print(TEXT_SEPARATOR) + + params_list = [] + df_list = [] + for percent in warmup_percentages: + # Select samples from the end (tail) of the dataframe + warmup_count = int(percent * len(raw_data)) + tail_count = len(raw_data) - warmup_count + if tail_count == 0: + # No reason to process if the count of samples is zero + break + + df = raw_data.tail(tail_count) + + # Runtime is the diff between the end of the last request + # and the start of the first request + runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0] + + # Convert milliseconds to seconds + runtime_sec = runtime_sec / 1000.0 + requests_per_sec = float(len(df)) / runtime_sec + + params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + + # Generate a summary of relevant metrics (and drop irrelevant data) + df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() + + # List for Excel file + params_list.append(params) + df_list.append(df) + + # Print the statistics summary + if percent > 0 or len(warmup_percentages) > 1: + print( + f"{Color.YELLOW}Statistics summary " + f"(assuming {percent:.0%} warmup samples):{Color.RESET}" + ) + else: + print(f"{Color.YELLOW}Statistics summary:{Color.RESET}") + + for k, v in params.items(): + if isinstance(v, float): + print(f"{k} = {v:.3f}") + else: + print(f"{k} = {v}") + print(TEXT_SEPARATOR) + print(df) + print(TEXT_SEPARATOR) + + if excel_output: + prefix = f"statistics_{test_params['num_clients']}_clients" + filename = get_filename_with_timestamp(prefix, "xlsx") + + with pd.ExcelWriter(filename, engine="xlsxwriter") as writer: + startrow = 0 + test_params_df = pd.DataFrame([test_params]) + test_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(test_params_df) + 3 + + if gen_conv_args is not None: + gen_params_df = pd.DataFrame([gen_params]) + gen_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=(startrow - 1) + ) + startrow += len(gen_params_df) + 3 + + for params, df_stats in zip(params_list, df_list): + df_params = pd.DataFrame([params]) + df_params.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(df_params) + 2 + df_stats.to_excel( + writer, sheet_name="Summary", index=True, startrow=startrow + ) + startrow += len(df_stats) + 3 + + raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0) + + logger.info( + f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}" + ) + + +async def get_server_info(url: str) -> None: + logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}") + async with aiohttp.ClientSession() as session: + # Get server version (not mandatory, "version" endpoint may not exist) + url_version = f"{url}/version" + async with session.get(url_version) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Server 
version: {text}{Color.RESET}") + + # Get available models + url_models = f"{url}/v1/models" + async with session.get(url_models) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Models:{Color.RESET}") + models_data = json.loads(text) + models_list = models_data["data"] + for model in models_list: + model_id = model["id"] + max_model_len = model.get("max_model_len", "N/A") + logger.info( + f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}" + ) + else: + logger.info(f"{Color.RED}Failed to get models{Color.RESET}") + + +async def main() -> None: + parser = argparse.ArgumentParser( + prog="Benchmark serving with multi-turn conversations", + description="Benchmark online inference using REST API", + ) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + + parser.add_argument( + "-i", + "--input-file", + type=str, + required=True, + help="Input JSON file with ShareGPT conversations or " + "configuration file for generation of synthetic conversations", + ) + parser.add_argument( + "-o", + "--output-file", + type=str, + default=None, + help="Output JSON file containing conversations with updated assistant answers", + ) + + parser.add_argument( + "--seed", + type=int, + default=0, + help="Seed for random number generators (default: 0)", + ) + parser.add_argument( + "-m", "--model", type=str, required=True, help="Path of the LLM model" + ) + parser.add_argument( + "-u", + "--url", + type=str, + default="http://localhost:8000", + help="Base URL for the LLM API server", + ) + + parser.add_argument( + "-p", + "--num-clients", + type=int, + default=1, + help="Number of clients that will send requests in parallel", + ) + parser.add_argument( + "-k", + "--max-active-conversations", + type=int, + default=None, + help="Max number of active conversations at a time (for all clients)", + ) + parser.add_argument( + "-n", + "--max-num-requests", + type=int, + default=None, + help="Max number of requests to send (total for all clients)", + ) + + parser.add_argument( + "--warmup-step", + default=False, + action="store_true", + help="Run a warmup step (using only the first turn of every conversation), " + "measurements will not be included in the final benchmark results", + ) + + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns/messages per conversation, " + "includes both user and assistant messages " + "(a positive number, e.g: 2, 4, 6, etc.), disabled by default", + ) + parser.add_argument( + "--no-early-stop", + default=False, + action="store_true", + help="By default, the benchmark will stop if at least one client exits." + " Use this flag to disable this behavior", + ) + + parser.add_argument( + "--limit-max-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set max_tokens for the output token count of each request " + "(must also set --limit-min-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + parser.add_argument( + "--limit-min-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set min_tokens for the output token count of each request " + "(must also set --limit-max-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + + parser.add_argument( + "--request-rate", + type=float, + default=0, + help="Expected request rate (Poisson process) per client in requests/sec." 
+ "Set to 0 for no delay between requests.", + ) + parser.add_argument( + "--conversation-sampling", + type=ConversationSampling, + choices=list(ConversationSampling), + default=ConversationSampling.ROUND_ROBIN, + help=( + "Strategy for selecting which conversation to use for the next request. " + "Options: 'round_robin' (cycle through conversations), " + "'random' (pick randomly)." + ), + ) + parser.add_argument( + "--verify-output", + default=False, + action="store_true", + help="Verify the LLM output (compare to the answers in the input JSON file)", + ) + + parser.add_argument( + "--no-stream", + default=False, + action="store_true", + help="Disable stream/streaming mode (set 'stream' to False in the API request)", + ) + + parser.add_argument( + "-e", + "--excel-output", + default=False, + action="store_true", + help="Export summary to Excel file (optional)", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--print-content", + default=False, + action="store_true", + help="Print the user prompts and the server's answers", + ) + + parser.add_argument( + "--warmup-percentages", + type=str, + default="0%", + help="Ignore the first X samples as warmup (X is a percentage)." + " A comma separated list of percentages can be used " + "(for example: --warmup-percentages=0%%,50%%)", + ) + + args = parser.parse_args() + + logger.info(args) + + logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}") + logger.info(f"url={args.url}") + logger.info(f"model={args.model}") + logger.info(f"num_clients={args.num_clients}") + + if args.verify_output: + logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}") + + # Calculate the amount of samples to filter (as warmup samples/measurements). 
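+    # Illustrative example: with --warmup-percentages=0%,50% and 1000 collected
+    # samples, the statistics summary is printed twice: once over all 1000
+    # samples and once over only the last 500 (the first 500 are treated as
+    # warmup and discarded).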
+ try: + warmup_percentages: list[float] = [0.0] + if not args.warmup_step: + # Warmup percentage can be used only if the warmup step was used + warmup_strings: list[str] = args.warmup_percentages.split(",") + warmup_strings = [x.replace("%", "") for x in warmup_strings] + warmup_percentages = [float(x) / 100 for x in warmup_strings] + + # Check for valid range (0 to 1) + for p in warmup_percentages: + assert p >= 0.0 and p < 1.0 + + # Sort from high to low warmup percentage + warmup_percentages.sort() + + logger.info( + f"Warmup percentages (percentage of samples): {warmup_percentages}" + ) + + except Exception: + raise ValueError( + f"Invalid --warmup-percentage={args.warmup_percentage}" + ) from None + + random.seed(args.seed) + np.random.seed(args.seed) + + if not os.path.exists(args.model): + raise OSError(f"Path does not exist: {args.model}") + logger.info("Loading tokenizer") + tokenizer = AutoTokenizer.from_pretrained(args.model) + + await get_server_info(args.url) + + # Load the input file (either conversations of configuration file) + logger.info(f"Reading input file: {args.input_file}") + with open(args.input_file) as f: + input_data = json.load(f) + + gen_conv_args = None + if isinstance(input_data, list): + # The conversations are stored as a list of dicts + logger.info(f"Found {len(input_data)} items in the input file") + + # Convert the list to a ConversationsMap + conversations = conversations_list_to_dict(input_data) + + elif isinstance(input_data, dict): + # The input file is a configuration file + # (type is determined by the field 'filetype') + if "filetype" not in input_data: + raise Exception( + f"Input file {args.input_file} is invalid (missing 'filetype')" + ) + + logger.info(f"Using input file with filetype: {input_data['filetype']}") + + gen_conv_args = parse_input_json_file(input_data) + + # Disable warning from "huggingface/tokenizers" + # (when using python multiprocessing and tokenizers) + os.environ["TOKENIZERS_PARALLELISM"] = "true" + + # Generate synthetic conversations + conversations = generate_conversations(gen_conv_args, tokenizer) + + else: + raise Exception(f"Input file {args.input_file} is invalid") + + if args.max_turns is not None: + if args.max_turns < 1: + raise ValueError("Max turns must be a positive number") + logger.info( + f"{Color.PURPLE}Max turns per conversation " + f"is limited to {args.max_turns}{Color.RESET}" + ) + + # Create benchmark configurations + client_args, req_args = get_client_config(args, conversations) + + bench_args = BenchmarkArgs( + url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop + ) + + # Warm-up step + if args.warmup_step: + # Only send a single user prompt from every conversation. + # max_active_conversations must be 1, + # otherwise the clients may exit after sending a single request + # (because the task queue is empty). 
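+        # (ClientArgs and BenchmarkArgs are NamedTuples, so _replace creates
+        # modified copies for the warmup run and leaves the main benchmark
+        # configuration untouched.)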
+ warmup_client_args = client_args._replace( + skip_first_turn=False, max_turns=1, max_active_conversations=1 + ) + + # Early stop should be disabled, + # all clients should finish their work before exiting + warmup_bench_args = bench_args._replace(early_stop=False) + + logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + conversations, _ = await main_mp( + warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations + ) + logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + + # Run the benchmark + start_time = time.perf_counter_ns() + client_convs, client_metrics = await main_mp( + client_args, req_args, bench_args, tokenizer, conversations + ) + total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + + # Calculate requests per second + total_runtime_sec = total_runtime_ms / 1000.0 + rps = len(client_metrics) / total_runtime_sec + logger.info( + f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" + f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + ) + + # Benchmark parameters + params = { + "model": args.model, + "num_clients": args.num_clients, + "num_conversations": len(conversations), + "active_conversations": args.max_active_conversations, + "seed": args.seed, + } + + if args.limit_min_tokens > 0: + params["min_tokens"] = args.limit_min_tokens + + if args.limit_max_tokens > 0: + params["max_tokens"] = args.limit_max_tokens + + # Process and print statistics (and save excel file with the statistics) + process_statistics( + client_metrics, + test_params=params, + warmup_percentages=warmup_percentages, + verbose=args.verbose, + gen_conv_args=gen_conv_args, + excel_output=args.excel_output, + ) + + if args.output_file is not None: + # Write a JSON file with the updated conversations + # The "assistant" content will contain the answers from the tested LLM + output_data: ShareGptConversations = conversations_dict_to_list(client_convs) + logger.info( + f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}" + ) + with open(args.output_file, "w") as f: + json.dump(output_data, f, indent=4) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py new file mode 100644 index 0000000000..c3622c99a2 --- /dev/null +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Download dataset from: +https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json + +Convert to OpenAI API: +export INPUT_FILE=sharegpt_20230401_clean_lang_split.json +python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128 +""" + +import argparse +import json +import random +from statistics import mean +from typing import Any, Optional + +import pandas as pd # type: ignore +import tqdm # type: ignore +from transformers import AutoTokenizer # type: ignore + + +def has_non_english_chars(text: str) -> bool: + return not text.isascii() + + +def content_is_valid( + content: str, min_content_len: Optional[int], max_content_len: Optional[int] +) -> bool: + if min_content_len and len(content) < min_content_len: + return False + + if max_content_len and len(content) > max_content_len: + return False + + return has_non_english_chars(content) + + +def print_stats( + conversations: "list[dict[Any, 
Any]]", tokenizer: Optional[AutoTokenizer] = None +) -> None: + # Collect statistics + stats = [] + + print("\nCollecting statistics...") + for item in tqdm.tqdm(conversations): + # item has "id" and "messages" + messages = item["messages"] + + user_turns = 0 + assistant_turns = 0 + user_words = 0 + assistant_words = 0 + conv_chars = 0 + + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + + for m in messages: + content = m["content"] + conv_chars += len(content) + content_num_words = content.count(" ") + 1 + + num_tokens = 0 + if tokenizer: + num_tokens = len(tokenizer(m["content"]).input_ids) + + if m["role"] == "user": + user_turns += 1 + user_words += content_num_words + if tokenizer: + user_tokens.append(num_tokens) + + elif m["role"] == "assistant": + assistant_turns += 1 + assistant_words += content_num_words + if tokenizer: + assistant_tokens.append(num_tokens) + + # assert user_turns == assistant_turns, \ + # f"Invalid conversation ID {item['id']}" + + conv_words = user_words + assistant_words + item_stats = { + "user_turns": user_turns, + "assistant_turns": assistant_turns, + "user_words": user_words, + "assistant_words": assistant_words, + "conv_turns": len(messages), + "conv_words": conv_words, + "conv_characters": conv_chars, + } + + if len(user_tokens) > 0: + item_stats["user_tokens"] = int(mean(user_tokens)) + + if len(assistant_tokens) > 0: + item_stats["assistant_tokens"] = int(mean(assistant_tokens)) + + stats.append(item_stats) + + print("\nStatistics:") + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + df = pd.DataFrame(stats) + print(df.describe(percentiles=percentiles).transpose()) + + +def convert_sharegpt_to_openai( + seed: int, + input_file: str, + output_file: str, + max_items: Optional[int], + min_content_len: Optional[int] = None, + max_content_len: Optional[int] = None, + min_turns: Optional[int] = None, + max_turns: Optional[int] = None, + model: Optional[str] = None, +) -> None: + if min_turns and max_turns: + assert min_turns <= max_turns + + if min_content_len and max_content_len: + # Verify that min is not larger than max if both were given + assert min_content_len <= max_content_len + + print( + f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=}," + f" {max_content_len=}, {min_turns=}, {max_turns=}\n" + ) + + random.seed(seed) + + tokenizer = None + if model is not None: + print(f"Loading tokenizer from: {model}") + tokenizer = AutoTokenizer.from_pretrained(model) + + # Read the ShareGPT JSON file + print(f"Reading file: {input_file}") + with open(input_file, encoding="utf-8") as f: + # Should be a list of dicts + # Each dict should have "id" (string) and "conversations" (list of dicts) + sharegpt_data = json.load(f) + + assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts" + + print(f"Total items in input file: {len(sharegpt_data):,}") + + print(f"Shuffling dataset with seed {seed}") + random.shuffle(sharegpt_data) + + # Map conversation ID to the all the messages + conversation_parts: dict[str, list[Any]] = {} + + for item in tqdm.tqdm(sharegpt_data): + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.) 
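+        # Illustrative example: items with ids "hiWPlMD_0" and "hiWPlMD_1" are
+        # two parts of the same conversation; both are collected under the key
+        # "hiWPlMD" and merged below (assumes exactly one "_" in every id).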
+ conv_id, _ = item["id"].split("_") + new_turns = item["conversations"] + + if conv_id not in conversation_parts: + # Start new conversation + conversation_parts[conv_id] = [] + elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0: + prev_turns = conversation_parts[conv_id][-1] + if prev_turns[-1]["from"] == new_turns[0]["from"]: + new_turns = new_turns[1:] + + if len(new_turns) > 0: + # We assume that parts are in order in the ShareGPT dataset + conversation_parts[conv_id].append(new_turns) + + dataset: list[dict[str, Any]] = [] + for conv_id, conv_parts in conversation_parts.items(): + new_item = {"id": conv_id} + + conversations: list[dict[str, str]] = [] + + # Merge all parts + for conv_part in conv_parts: + conversations.extend(conv_part) + + if len(conversations) > 0: + new_item["conversations"] = conversations + dataset.append(new_item) + + print(f"Total unique conversations (IDs) in input file: {len(dataset):,}") + + # Final output data + final_openai_dataset: list[dict] = [] + + # Filter conversations from the ShareGPT dataset and convert to OpenAI format + for item in tqdm.tqdm(dataset): + messages: list[dict] = [] + + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + conv_id = item["id"] + conversations = item["conversations"] + + if min_turns is not None and len(conversations) < min_turns: + # Skip short conversations + continue + + # Convert each message in the conversation, up to max_turns if specified + for i, turn in enumerate(conversations): + assert "from" in turn and "value" in turn, ( + f"Invalid conversation ID {conv_id} - missing 'from' or 'value'" + ) + + role = None + turn_from = turn["from"] + + if turn_from in {"human", "user"}: + role = "user" + elif turn_from in {"gpt", "bing", "chatgpt", "bard"}: + role = "assistant" + elif turn_from == "system": + role = "system" + + assert role is not None, ( + f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid" + ) + + if i == 0 and role != "user": + # If the first message is from assistant (gpt), skip it. + # this happens when the conversation is a follow-up + # to a previous conversation (from the same user). 
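+                # (Only this leading assistant message is dropped; the
+                # alternating user/assistant validation further below discards
+                # any conversation that is still mis-ordered.)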
+ continue + + if max_turns is not None and i >= max_turns: + break + + # Convert message to OpenAI format (with "role" and "content") + content = turn["value"] + messages.append({"role": role, "content": content}) + + # Add the converted conversation to the OpenAI format + if len(messages) > 0: + valid_messages = True + + # First turn should always be from the user + user_turn = True + + for m in messages: + # Make sure that turns alternate between user and assistant + if (user_turn and m["role"] != "user") or ( + not user_turn and m["role"] != "assistant" + ): + valid_messages = False + break + + user_turn = not user_turn + + content = m["content"] + valid_messages = content_is_valid( + content, min_content_len, max_content_len + ) + if not valid_messages: + break + + if valid_messages is True: + final_openai_dataset.append({"id": conv_id, "messages": messages}) + + assert len(final_openai_dataset) > 0, "Final number of conversations is zero" + + print_stats(final_openai_dataset) + + print_stats_again = False + if max_items is not None and len(final_openai_dataset) > max_items: + print(f"\n\nSampling {max_items} items from the dataset...") + print_stats_again = True + final_openai_dataset = random.sample(final_openai_dataset, max_items) + + if print_stats_again: + # Print stats after the dataset changed + print_stats(final_openai_dataset, tokenizer) + + # Write the converted data to a new JSON file + final_size = len(final_openai_dataset) + print(f"\nTotal conversations converted (after filtering): {final_size:,}") + print(f"\nWriting file: {output_file}") + with open(output_file, "w", encoding="utf-8") as f: + json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert ShareGPT dataset to OpenAI API format" + ) + parser.add_argument("input_file", help="Path to the input ShareGPT JSON file") + parser.add_argument( + "output_file", help="Path to the output OpenAI format JSON file" + ) + parser.add_argument( + "--seed", type=int, default=0, help="Seed for random number generators" + ) + parser.add_argument( + "--max-items", + type=int, + default=None, + help="Maximum number of items in the output file", + ) + parser.add_argument( + "--min-turns", + type=int, + default=None, + help="Minimum number of turns per conversation", + ) + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns per conversation", + ) + parser.add_argument( + "--min-content-len", + type=int, + default=None, + help="Min number of characters in the messages' content", + ) + parser.add_argument( + "--max-content-len", + type=int, + default=None, + help="Max number of characters in the messages' content", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="LLM model, only the tokenizer will be used", + ) + + args = parser.parse_args() + + convert_sharegpt_to_openai( + args.seed, + args.input_file, + args.output_file, + args.max_items, + args.min_content_len, + args.max_content_len, + args.min_turns, + args.max_turns, + args.model, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json new file mode 100644 index 0000000000..274d03c2bd --- /dev/null +++ b/benchmarks/multi_turn/generate_multi_turn.json @@ -0,0 +1,35 @@ +{ + "filetype": "generate_conversations", + "num_conversations": 24, + "text_files": ["pg1184.txt"], + "print_stats": false, + "prompt_input": { + 
"num_turns": { + "distribution": "uniform", + "min": 12, + "max": 18 + }, + "common_prefix_num_tokens": { + "distribution": "constant", + "value": 500 + }, + "prefix_num_tokens": { + "distribution": "lognormal", + "mean": 6, + "sigma": 4, + "max": 1500 + }, + "num_tokens": { + "distribution": "uniform", + "min": 120, + "max": 160 + } + }, + "prompt_output": { + "num_tokens": { + "distribution": "uniform", + "min": 80, + "max": 120 + } + } +} \ No newline at end of file diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt new file mode 100644 index 0000000000..f0e1935914 --- /dev/null +++ b/benchmarks/multi_turn/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +transformers>=4.46 +xlsxwriter>=3.2.1 \ No newline at end of file From f756a682d96ba1824b6a759017f9d27a7f5f0182 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Fri, 8 Aug 2025 11:18:33 -0700 Subject: [PATCH 094/932] [gpt-oss] guard import when triton kernel is not installed (#22529) Signed-off-by: Yongye Zhu Signed-off-by: Woosuk Kwon Co-authored-by: Woosuk Kwon --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 4482029c16..6d6a2e22bc 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import torch @@ -8,13 +8,16 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import extract_required_args +from vllm.utils import has_triton_kernels -if True: +if has_triton_kernels(): import triton_kernels.swiglu - from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, - PrecisionConfig, matmul_ogs) + from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs from triton_kernels.routing import routing +if TYPE_CHECKING: + from triton_kernels.matmul_ogs import PrecisionConfig + def triton_kernel_moe_forward( hidden_states: torch.Tensor, @@ -33,8 +36,8 @@ def triton_kernel_moe_forward( w2_scale: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None, - w1_precision=None, # PrecisionConfig or None - w2_precision=None, # PrecisionConfig or None + w1_precision: Optional["PrecisionConfig"] = None, + w2_precision: Optional["PrecisionConfig"] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, @@ -90,8 +93,8 @@ def triton_kernel_fused_experts( w2_scale: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None, - w1_precision=None, # PrecisionConfig or None - w2_precision=None, # PrecisionConfig or None + w1_precision: Optional["PrecisionConfig"] = None, + w2_precision: Optional["PrecisionConfig"] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, @@ -141,8 +144,14 @@ def triton_kernel_fused_experts( class 
BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): - def __init__(self, quant_config, max_num_tokens: int, num_dispatchers: int, - w1_precision: PrecisionConfig, w2_precision: PrecisionConfig): + def __init__( + self, + quant_config, + max_num_tokens: int, + num_dispatchers: int, + w1_precision: "PrecisionConfig", + w2_precision: "PrecisionConfig", + ): super().__init__(quant_config) self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers From e29059407251c071a75b1b1d89471326add28b90 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Fri, 8 Aug 2025 12:26:21 -0700 Subject: [PATCH 095/932] =?UTF-8?q?[Docs]=20Rename=20=E2=80=9CDistributed?= =?UTF-8?q?=20inference=20and=20serving=E2=80=9D=20to=20=E2=80=9CParalleli?= =?UTF-8?q?sm=20&=20Scaling=E2=80=9D=20(#22466)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ricardo Decal --- docs/models/supported_models.md | 20 +++++++++---------- ...uted_serving.md => parallelism_scaling.md} | 2 +- docs/usage/troubleshooting.md | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) rename docs/serving/{distributed_serving.md => parallelism_scaling.md} (99%) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 265643a441..b79650444a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -320,7 +320,7 @@ th { } -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -426,7 +426,7 @@ See [this page](./pooling_models.md) for more information on how to use pooling These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | | `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | @@ -466,7 +466,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. 
| ✅︎ | ✅︎ | | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | @@ -483,7 +483,7 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | @@ -521,7 +521,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -594,7 +594,7 @@ See [this page](generative_models.md) for more information on how to use generat These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| | `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | | `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | @@ -647,7 +647,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| | `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ | @@ -726,7 +726,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th Speech2Text models trained specifically for Automatic Speech Recognition. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ | @@ -744,7 +744,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A The following table lists those that are tested in vLLM. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| | `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | | | `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | | @@ -760,7 +760,7 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][parallelism-scaling] | [V1](gh-issue:8779) | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. 
| | | ✅︎ | diff --git a/docs/serving/distributed_serving.md b/docs/serving/parallelism_scaling.md similarity index 99% rename from docs/serving/distributed_serving.md rename to docs/serving/parallelism_scaling.md index fc9d9f8a34..fa7fc1b290 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/parallelism_scaling.md @@ -1,4 +1,4 @@ -# Distributed inference and serving +# Parallelism and Scaling ## Distributed inference strategies for a single-model replica diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index f9ba32c58c..9715ad66d9 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -289,7 +289,7 @@ Traceback (most recent call last): ... ``` -This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Distributed Inference and Serving](../serving/distributed_serving.md#running-vllm-on-multiple-nodes) for guidance on properly configuring the environment for distributed serving. +This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA. ## Known Issues From fe6d8257a1859cdd938cb2ec2a63a45c666dcca3 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 8 Aug 2025 15:06:37 -0700 Subject: [PATCH 096/932] [gpt-oss] Support tool call and implement MCP tool server (#22427) Signed-off-by: Chen Zhang --- vllm/entrypoints/harmony_utils.py | 5 +- vllm/entrypoints/openai/api_server.py | 6 +- vllm/entrypoints/openai/serving_responses.py | 185 +++++++++++-------- vllm/entrypoints/tool_server.py | 119 +++++++++++- 4 files changed, 233 insertions(+), 82 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 87e76e08a0..efca1472e4 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -237,7 +237,10 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: id=f"rs_{random_uuid()}", summary=[], type="reasoning", - text=content.text, + content=[ + ResponseReasoningTextContent(text=content.text, + type="reasoning_text") + ], status=None, ) output_items.append(reasoning_item) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c695ea8b5a..00eaba8c87 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -94,7 +94,8 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.tool_server import DemoToolServer, ToolServer +from vllm.entrypoints.tool_server import (DemoToolServer, MCPToolServer, + ToolServer) from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) from vllm.logger import init_logger @@ -1635,6 +1636,9 @@ async def init_app_state( if args.tool_server == "demo": tool_server: Optional[ToolServer] = DemoToolServer() + elif args.tool_server: + tool_server = MCPToolServer() + await tool_server.add_tool_server(args.tool_server) else: tool_server = None diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py 
index a7554e0d68..1e3746e956 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,6 +4,7 @@ import asyncio import time from collections.abc import AsyncGenerator, AsyncIterator +from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus from typing import Any, Callable, Final, Optional, Union @@ -226,65 +227,114 @@ class OpenAIServingResponses(OpenAIServing): # Schedule the request and get the result generator. generators: list[AsyncGenerator[ConversationContext, None]] = [] - try: - tool_sessions: dict[str, Any] = {} - for i, engine_prompt in enumerate(engine_prompts): - default_max_tokens = self.max_model_len - len( - engine_prompt["prompt_token_ids"]) - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params) - trace_headers = (None if raw_request is None else await - self._get_trace_headers(raw_request.headers)) - - context: ConversationContext - if self.use_harmony: - if request.stream: - context = StreamingHarmonyContext( - messages, tool_sessions) - else: - context = HarmonyContext(messages, tool_sessions) + builtin_tool_list: list[str] = [] + if self.use_harmony and self.tool_server is not None: + if self.tool_server.has_tool("browser"): + builtin_tool_list.append("browser") + if self.tool_server.has_tool("python"): + builtin_tool_list.append("python") + async with AsyncExitStack() as exit_stack: + try: + if self.tool_server is not None: + # TODO: initialize tool sessions lazily when the session + # is actually used. + tool_session_ctxs: dict[str, Any] = { + tool_name: + exit_stack.enter_async_context( + self.tool_server.new_session(tool_name)) + for tool_name in builtin_tool_list + } + tool_sessions = {} + for tool_name in builtin_tool_list: + tool_sessions[tool_name] = ( + await tool_session_ctxs[tool_name]) else: - context = SimpleContext() - generator = self._generate_with_builtin_tools( - request_id=request.request_id, - request_prompt=request_prompts[i], - engine_prompt=engine_prompt, - sampling_params=sampling_params, - context=context, - lora_request=lora_request, - priority=request.priority, - trace_headers=trace_headers, + assert len(builtin_tool_list) == 0 + tool_sessions = {} + for i, engine_prompt in enumerate(engine_prompts): + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers( + raw_request.headers)) + + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, tool_sessions) + else: + context = HarmonyContext(messages, tool_sessions) + else: + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, + lora_request=lora_request, + priority=request.priority, + trace_headers=trace_headers, + ) + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + assert len(generators) == 1 + result_generator, = generators + + # Store the input messages. 
+ if request.store: + self.msg_store[request.request_id] = messages + + if request.background: + created_time = int(time.time()) + response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="queued", + usage=None, ) - generators.append(generator) - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + async with self.response_store_lock: + self.response_store[response.id] = response - assert len(generators) == 1 - result_generator, = generators + # Run the request in the background. + task = asyncio.create_task( + self._run_background_request( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + ), + name=f"create_{response.id}", + ) - # Store the input messages. - if request.store: - self.msg_store[request.request_id] = messages + # For cleanup. + response_id = response.id + self.background_tasks[response_id] = task + task.add_done_callback( + lambda _: self.background_tasks.pop(response_id, None)) + return response - if request.background: - created_time = int(time.time()) - response = ResponsesResponse.from_request( - request, - sampling_params, - model_name=model_name, - created_time=created_time, - output=[], - status="queued", - usage=None, - ) - async with self.response_store_lock: - self.response_store[response.id] = response + if request.stream: + raise NotImplementedError( + "Streaming responses are not supported") - # Run the request in the background. - task = asyncio.create_task( - self._run_background_request( + try: + return await self.responses_full_generator( request, sampling_params, result_generator, @@ -292,33 +342,10 @@ class OpenAIServingResponses(OpenAIServing): model_name, tokenizer, request_metadata, - created_time, - ), - name=f"create_{response.id}", - ) - - # For cleanup. 
- response_id = response.id - self.background_tasks[response_id] = task - task.add_done_callback( - lambda _: self.background_tasks.pop(response_id, None)) - return response - - if request.stream: - raise NotImplementedError("Streaming responses are not supported") - - try: - return await self.responses_full_generator( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - ) - except Exception as e: - return self.create_error_response(str(e)) + ) + except Exception as e: + return self.create_error_response(str(e)) + return self.create_error_response("Should not reach here") async def _make_request( self, diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 769c40e8cc..352704b2b3 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from contextlib import AbstractAsyncContextManager, asynccontextmanager -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional from openai_harmony import ToolNamespaceConfig @@ -11,6 +11,61 @@ from vllm.logger import init_logger logger = init_logger(__name__) +if TYPE_CHECKING: + from mcp.types import ListToolsResult + + +async def list_server_and_tools(server_url: str): + from mcp import ClientSession + from mcp.client.sse import sse_client + + async with sse_client(url=server_url) as streams, ClientSession( + *streams) as session: + initialize_response = await session.initialize() + list_tools_response = await session.list_tools() + return initialize_response, list_tools_response + + +def trim_schema(schema: dict) -> dict: + # Turn the JSON Schema generated by MCP into Harmony's variant. + if "title" in schema: + del schema["title"] + if "default" in schema and schema["default"] is None: + del schema["default"] + if "anyOf" in schema: + # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}] + # into "type": ["type-1", "type-2"] + # If there's more than one type, also remove the "null" type as Harmony will + # just ignore it + types = [ + type_dict["type"] for type_dict in schema["anyOf"] + if type_dict["type"] != 'null' + ] + schema["type"] = types + del schema["anyOf"] + if "properties" in schema: + schema["properties"] = { + k: trim_schema(v) + for k, v in schema["properties"].items() + } + return schema + + +def post_process_tools_description( + list_tools_result: "ListToolsResult") -> "ListToolsResult": + # Adapt the MCP tool result for Harmony + for tool in list_tools_result.tools: + tool.inputSchema = trim_schema(tool.inputSchema) + + # Some tool schemas don't need to be part of the prompt (e.g. simple text + # in text out for Python) + list_tools_result.tools = [ + tool for tool in list_tools_result.tools + if getattr(tool.annotations, "include_in_prompt", True) + ] + + return list_tools_result + class ToolServer(ABC): @@ -38,6 +93,66 @@ class ToolServer(ABC): ... + +class MCPToolServer(ToolServer): + + def __init__(self): + try: + import mcp # noqa: F401 + except ImportError: + raise ImportError( + "mcp is not installed. 
Please run `pip install mcp` to use " + "MCPToolServer.") from None + self.harmony_tool_descriptions = {} + + async def add_tool_server(self, server_url: str): + from mcp.types import ToolDescription + tool_urls = server_url.split(",") + self.harmony_tool_descriptions = {} + self.urls: dict[str, str] = {} + for url in tool_urls: + url = f"http://{url}/sse" + initialize_response, list_tools_response = ( + await list_server_and_tools(url)) + + list_tools_response = post_process_tools_description( + list_tools_response) + + tool_from_mcp = ToolNamespaceConfig( + name=initialize_response.serverInfo.name, + description=initialize_response.instructions, + tools=[ + ToolDescription.new(name=tool.name, + description=tool.description, + parameters=tool.inputSchema) + for tool in list_tools_response.tools + ]) + self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp + if tool_from_mcp.name not in self.urls: + self.urls[tool_from_mcp.name] = url + else: + logger.warning( + "Tool %s already exists. Ignoring duplicate tool server %s", + tool_from_mcp.name, url) + + def has_tool(self, tool_name: str): + return tool_name in self.harmony_tool_descriptions + + def get_tool_description(self, tool_name: str): + return self.harmony_tool_descriptions.get(tool_name) + + @asynccontextmanager + async def new_session(self, tool_name: str): + from mcp import ClientSession + from mcp.client.sse import sse_client + url = self.urls.get(tool_name) + if not url: + raise KeyError(f"Tool '{tool_name}' is not supported") + async with sse_client(url=url) as streams, ClientSession( + *streams) as session: + await session.initialize() + yield session + + class DemoToolServer(ToolServer): def __init__(self): @@ -67,4 +182,6 @@ class DemoToolServer(ToolServer): @asynccontextmanager async def new_session(self, tool_name: str): + if tool_name not in self.tools: + raise KeyError(f"Tool '{tool_name}' is not supported") yield self.tools[tool_name] From cd9b9de1fb009cf607403ba08961f2a3f869931d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 8 Aug 2025 19:09:42 -0400 Subject: [PATCH 097/932] [BugFix] Fix IMA FlashMLA full cuda-graph and DP + Update FlashMLA (#21691) Signed-off-by: Lucas Wilkinson Co-authored-by: yewentao256 Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- cmake/external_projects/flashmla.cmake | 8 ++-- vllm/attention/ops/flashmla.py | 1 - vllm/v1/attention/backends/mla/flashmla.py | 56 ++++++++++++++-------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index 6291475164..ee6768bce2 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -19,7 +19,7 @@ else() FetchContent_Declare( flashmla GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git - GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845 + GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1 GIT_PROGRESS TRUE CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -37,9 +37,9 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) set(FlashMLA_SOURCES ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu) + ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu + ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu + 
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu) set(FlashMLA_INCLUDES ${flashmla_SOURCE_DIR}/csrc/cutlass/include diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index b85f27ac41..1af26dfc3d 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -91,7 +91,6 @@ def flash_mla_with_kvcache( out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla( q, k_cache, - None, head_dim_v, cache_seqlens, block_table, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index b5aecff993..2b0f52cf80 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -70,6 +70,22 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): self.cg_buf_tile_scheduler_metadata = None self.cg_buf_num_splits = None + device_properties = torch.cuda.get_device_properties(self.device) + num_sms = device_properties.multi_processor_count + + if self.compilation_config.full_cuda_graph: + self.cg_buf_tile_scheduler_metadata = torch.zeros( + # Upper bound on size (<= #SMs, TileSchedulerMetaDataSize) + # TileSchedulerMetaDataSize = 8 + (num_sms, 8), + device=self.device, + dtype=torch.int32, + ) + self.cg_buf_num_splits = torch.empty( + (vllm_config.scheduler_config.max_num_seqs + 1), + device=self.device, + dtype=torch.int32) + def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens: torch.Tensor) -> FlashMLADecodeMetadata: tile_scheduler_metadata, num_splits = \ @@ -80,28 +96,28 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): ) if self.compilation_config.full_cuda_graph: - # First time around (CUDAGraph capture), allocate the static buffer - if self.cg_buf_tile_scheduler_metadata is None: - self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata - self.cg_buf_num_splits = num_splits - else: - assert self.cg_buf_num_splits is not None + assert self.cg_buf_tile_scheduler_metadata is not None + assert self.cg_buf_num_splits is not None - # Metadata per-SM, fixed size (#SMs, TileMetadataSize) - assert (self.cg_buf_tile_scheduler_metadata.size() == - tile_scheduler_metadata.size()) - self.cg_buf_tile_scheduler_metadata.\ - copy_(tile_scheduler_metadata) - tile_scheduler_metadata = self.cg_buf_tile_scheduler_metadata + sm_parts = tile_scheduler_metadata.size(0) + # Metadata per-SM, upper bound on size (<= #SMs, TileMetadataSize) + assert sm_parts <= self.cg_buf_tile_scheduler_metadata.size(0) + tile_scheduler_metadata_view = \ + self.cg_buf_tile_scheduler_metadata[:sm_parts] + tile_scheduler_metadata_view.copy_(tile_scheduler_metadata) + tile_scheduler_metadata = tile_scheduler_metadata_view - # Num splits is per-batch, varying size (batch_size,) - n = num_splits.size(0) - # make sure static buffer is large enough - assert n <= self.cg_buf_num_splits.size(0) - num_splits_view = self.cg_buf_num_splits[:n] - num_splits_view.copy_(num_splits) - self.cg_buf_num_splits[n:].fill_(0) # fill the rest with 0s - num_splits = num_splits_view + # Num splits is per-batch, varying size (batch_size,) + n = num_splits.size(0) + # make sure static buffer is large enough + assert n <= self.cg_buf_num_splits.size(0) + num_splits_view = self.cg_buf_num_splits[:n] + num_splits_view.copy_(num_splits) + # Num splits needs to be monotonically increasing + # (with: https://github.com/vllm-project/FlashMLA/pull/3, otherwise + # it needs to increase monotonically by 1) + self.cg_buf_num_splits[n:].fill_(num_splits[-1]) + num_splits = 
num_splits_view return FlashMLADecodeMetadata( block_table=block_table_tensor, From f703b923f3885157cf02b951c42f967c25329b01 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 8 Aug 2025 19:09:59 -0400 Subject: [PATCH 098/932] [Misc] DeepGEMM : Avoid JIT generation in the hot-path (#22215) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../layers/fused_moe/deep_gemm_moe.py | 12 - .../layers/fused_moe/fused_moe.py | 55 +++-- .../model_executor/warmup/deep_gemm_warmup.py | 219 ++++++++++++++++++ vllm/model_executor/warmup/kernel_warmup.py | 20 ++ vllm/v1/worker/gpu_worker.py | 5 + 5 files changed, 274 insertions(+), 37 deletions(-) create mode 100644 vllm/model_executor/warmup/deep_gemm_warmup.py create mode 100644 vllm/model_executor/warmup/kernel_warmup.py diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index ba7105c83a..9b8175f42a 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -237,18 +237,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): assert w1_scale is not None assert w2_scale is not None - if not env.VLLM_SKIP_DEEP_GEMM_WARMUP: - # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing - # to happen during actual model-inference. The - # `warmup_deepgemm_kernels` function is a `run_once` decorated - # function that executes during the model profile run. This warmup - # should create all the required JITs for the current model. - warmup_deepgemm_gg_contiguous_kernels(w1, - w2, - w1_scale, - w2_scale, - num_topk=topk_ids.size(1)) - a1q = hidden_states _, N, K = w1.size() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 597af08c3c..f4f5457ebc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -4,6 +4,9 @@ import functools import json import os +# torch.compile needs typing.List. 
It will fail torch.library.infer_schema +# otherwise +from typing import List # noqa: UP035 from typing import Any, Callable, Optional import torch @@ -998,29 +1001,30 @@ def get_config_dtype_str( return None -def inplace_fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: +def inplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None) -> None: #noqa: UP006 fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, @@ -1082,7 +1086,7 @@ def flashinfer_fused_moe_blockscale_fp8( intermediate_size: int, expert_offset: int, local_num_experts: int, - block_shape: list[int], + block_shape: List[int], #noqa: UP006 routed_scaling: float = 1.0) -> torch.Tensor: from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe assert top_k <= global_num_experts @@ -1264,7 +1268,8 @@ def outplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None, #noqa: UP006 +) -> torch.Tensor: return fused_experts_impl( hidden_states, w1, w2, topk_weights, topk_ids, False, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py new file mode 100644 index 0000000000..74599fa44c --- /dev/null +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Warmup deep_gemm kernels. +DeepGEMM JIT's the kernels. The warmup aims to JIT all the kernels that would +be used during model execution beforehand. 
+""" + +import torch +from tqdm import tqdm + +import vllm.envs as envs +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( + compute_aligned_M, deep_gemm_block_shape) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts) +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +from vllm.utils.deep_gemm import fp8_gemm_nt, m_grouped_fp8_gemm_nt_contiguous + + +def _extract_data_from_linear_base_module( + m: torch.nn.Module) -> tuple[torch.Tensor, torch.Tensor, list[int]]: + """ + Extract weights, weight scales and quantization block sizes from the given + LinearBase module. + """ + assert isinstance(m, LinearBase) + assert isinstance(m.quant_method, Fp8LinearMethod) + assert m.quant_method.block_quant + assert m.quant_method.quant_config is not None + + w = m.weight + ws = m.weight_scale_inv + quant_block_size = m.quant_method.quant_config.weight_block_size + + assert isinstance(w, torch.Tensor) + assert isinstance(ws, torch.Tensor) + assert quant_block_size is not None + return (w, ws, quant_block_size) + + +def _extract_data_from_fused_moe_module( + m: torch.nn.Module +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]: + """ + Extract weights, weight scales and num_topk from FusedMoE module. + """ + assert isinstance(m, FusedMoE) + w13 = m.w13_weight + w13_s = m.w13_weight_scale_inv + w2 = m.w2_weight + w2_s = m.w2_weight_scale_inv + num_topk = m.top_k + + assert isinstance(w13, torch.Tensor) + assert isinstance(w13_s, torch.Tensor) + assert isinstance(w2, torch.Tensor) + assert isinstance(w2_s, torch.Tensor) + return w13, w13_s, w2, w2_s, num_topk + + +def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool: + """ + Return True if the input module/layer could be processed with DeepGEMM. 
+ """ + block_size = deep_gemm_block_shape()[0] + if not (isinstance(module, LinearBase) + and isinstance(module.quant_method, Fp8LinearMethod) + and module.quant_method.block_quant): + return False + + w, _, block_sizes = _extract_data_from_linear_base_module(module) + return (block_sizes == deep_gemm_block_shape() and w.ndim == 2 + and w.shape[0] % block_size == 0 and w.shape[1] % block_size == 0) + + +def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: + if not (isinstance(module, FusedMoE) + and module.moe_config.quant_dtype == torch.float8_e4m3fn + and module.moe_config.block_shape == deep_gemm_block_shape()): + return False + + if not isinstance(module.quant_method.fused_experts, + FusedMoEModularKernel): + # fused_experts could invoke deep_gemm_moe_fp8 + return True + + mk: FusedMoEModularKernel = module.quant_method.fused_experts + # Further check if the ModularKernel implementation uses the DeepGemmExperts + return isinstance(mk.fused_experts, + (DeepGemmExperts, TritonOrDeepGemmExperts)) + + +FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set() + + +def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, + max_tokens: int): + if w.size() in FP8_GEMM_NT_WARMUP_CACHE: + return + + n, k = w.size() + block_m = deep_gemm_block_shape()[0] + + device = w.device + a1q = torch.empty((max_tokens, k), + device=device, + dtype=torch.float8_e4m3fn) + a1q_scales = torch.empty((max_tokens, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((max_tokens, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm(total=max_tokens, + desc=f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()})") + num_tokens = max_tokens + while num_tokens > 0: + fp8_gemm_nt((a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws), + out[:num_tokens]) + pbar.update(1) + num_tokens -= 1 + + FP8_GEMM_NT_WARMUP_CACHE.add(w.size()) + + +GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set() + + +def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int): + if (w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE + and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE): + return + + assert w1.size(0) == w2.size(0), ( + "w1 and w2 must have the same number of experts") + + block_m = deep_gemm_block_shape()[0] + num_experts = w1.size(0) + device = w1.device + + # This is the maximum GroupedGemm M size that we expect to run + # the grouped_gemm with. + MAX_M = compute_aligned_M(envs.VLLM_FUSED_MOE_CHUNK_SIZE, + num_topk, + num_experts, + block_m, + expert_tokens_meta=None) + # Distribute expert-ids evenly. 
+ MAX_BLOCKS = MAX_M // block_m + expert_ids_block = torch.randint(low=0, + high=num_experts, + size=(MAX_BLOCKS, ), + device=device, + dtype=torch.int32) + expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): + + _, n, k = w.size() + a1q = torch.empty((MAX_M, k), device=device, dtype=torch.float8_e4m3fn) + a1q_scales = torch.empty((MAX_M, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm( + total=MAX_BLOCKS, + desc= + f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()})" + ) + num_tokens = MAX_M + while num_tokens > 0: + m_grouped_fp8_gemm_nt_contiguous( + (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale), + out[:num_tokens], expert_ids[:num_tokens]) + pbar.update(1) + num_tokens = num_tokens - block_m + + for w, ws in [(w1, w1_scale), (w2, w2_scale)]: + if w.size() not in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: + _warmup(w, ws) + GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE.add(w.size()) + + +def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int): + dg_modules = [ + m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m) + ] + + for dgm in dg_modules: + w, ws, _ = _extract_data_from_linear_base_module(dgm) + _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens) + + +def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module): + dg_modules = [ + m for m in model.modules() + if _fused_moe_grouped_gemm_may_use_deep_gemm(m) + ] + + for dgm in dg_modules: + w13, w13_scale, w2, w2_scale, num_topk = ( + _extract_data_from_fused_moe_module(dgm)) + _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( + w13, w2, w13_scale, w2_scale, num_topk) + + +def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int): + deepgemm_fp8_gemm_nt_warmup(model, max_tokens) + deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py new file mode 100644 index 0000000000..10f2dc0252 --- /dev/null +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Warmup kernels used during model execution. +This is useful specifically for JIT'ed kernels as we don't want JIT'ing to +happen during model execution. 
+""" +import torch + +import vllm.envs as envs +from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup +from vllm.utils.deep_gemm import is_deep_gemm_supported + + +def kernel_warmup(model: torch.nn.Module, max_tokens: int): + do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM + and is_deep_gemm_supported() + and not envs.VLLM_SKIP_DEEP_GEMM_WARMUP) + if do_deep_gemm_warmup: + deep_gemm_warmup(model, max_tokens) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7fca245c1b..0ea23921a0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -21,6 +21,7 @@ from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask @@ -338,6 +339,10 @@ class Worker(WorkerBase): self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) + # Warmup kernels used during model execution + kernel_warmup(self.get_model(), + max_tokens=self.scheduler_config.max_num_batched_tokens) + # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) From bd875d2eb71b130cbc2b68bf0e2dd285f5c7348d Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 01:10:25 +0200 Subject: [PATCH 099/932] [Bugfix] Update FA commit hash (#22546) Signed-off-by: Thomas Parnell --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 59b99e9e20..d24d8e8e5e 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba + GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 41b965575136a72c21927b87a16bd7460b3a3cf8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 00:20:58 +0100 Subject: [PATCH 100/932] Skip Qwen 1 in CI because remote code is no longer compatible with Transformers (#22536) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index 2c2d094e04..b1952ce9c2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,6 +278,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501 trust_remote_code=True), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501 From 2fcf6b27b6902a18aaf4a6fb8cf5c7efc8afc731 
Mon Sep 17 00:00:00 2001 From: Guy Stone Date: Fri, 8 Aug 2025 19:22:35 -0400 Subject: [PATCH 101/932] [Docs] fix broken links in metrics.md (#22315) Signed-off-by: Guy Stone Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/metrics.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 1f65331d3c..b01838883f 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -57,11 +57,11 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:spec_decode_num_draft_tokens_total` (Counter) - `vllm:spec_decode_num_emitted_tokens_total` (Counter) -These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md). +These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md). ### Grafana Dashboard -vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -455,7 +455,7 @@ In general: [an escape hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. -See the [deprecation policy](../../contributing/deprecation_policy.md) for +See the [deprecation policy](../contributing/deprecation_policy.md) for the project-wide deprecation policy. ### Unimplemented - `vllm:tokens_total` @@ -655,7 +655,7 @@ v0 has support for OpenTelemetry tracing: - Added by - Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces` - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) -- [User-facing docs](../../examples/online_serving/opentelemetry.md) +- [User-facing docs](../examples/online_serving/opentelemetry.md) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview) From baece8c3d26484b918fa123c884e6ea81237b661 Mon Sep 17 00:00:00 2001 From: yyweiss <70619747+yyweiss@users.noreply.github.com> Date: Sat, 9 Aug 2025 02:23:44 +0300 Subject: [PATCH 102/932] [Frontend] Add unix domain socket support (#18097) Signed-off-by: Signed-off-by: yyw --- docs/cli/README.md | 3 ++ tests/entrypoints/openai/test_uds.py | 43 +++++++++++++++++++++++++++ tests/utils.py | 27 ++++++++++++----- vllm/entrypoints/openai/api_server.py | 27 ++++++++++++----- vllm/entrypoints/openai/cli_args.py | 2 ++ 5 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 tests/entrypoints/openai/test_uds.py diff --git a/docs/cli/README.md b/docs/cli/README.md index b1371c82a4..a7de6d7192 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -29,6 +29,9 @@ Start the vLLM OpenAI Compatible API server. 
# Specify the port vllm serve meta-llama/Llama-2-7b-hf --port 8100 + # Serve over a Unix domain socket + vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock + # Check with --help for more options # To list all groups vllm serve --help=listgroup diff --git a/tests/entrypoints/openai/test_uds.py b/tests/entrypoints/openai/test_uds.py new file mode 100644 index 0000000000..5c39869a79 --- /dev/null +++ b/tests/entrypoints/openai/test_uds.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from tempfile import TemporaryDirectory + +import httpx +import pytest + +from vllm.version import __version__ as VLLM_VERSION + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def server(): + with TemporaryDirectory() as tmpdir: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + "--uds", + f"{tmpdir}/vllm.sock", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_show_version(server: RemoteOpenAIServer): + transport = httpx.HTTPTransport(uds=server.uds) + client = httpx.Client(transport=transport) + response = client.get(server.url_for("version")) + response.raise_for_status() + + assert response.json() == {"version": VLLM_VERSION} diff --git a/tests/utils.py b/tests/utils.py index 741b4401cc..18fcde9491 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Any, Callable, Literal, Optional, Union import cloudpickle +import httpx import openai import pytest import requests @@ -88,10 +89,12 @@ class RemoteOpenAIServer: raise ValueError("You have manually specified the port " "when `auto_port=True`.") - # Don't mutate the input args - vllm_serve_args = vllm_serve_args + [ - "--port", str(get_open_port()) - ] + # No need for a port if using unix sockets + if "--uds" not in vllm_serve_args: + # Don't mutate the input args + vllm_serve_args = vllm_serve_args + [ + "--port", str(get_open_port()) + ] if seed is not None: if "--seed" in vllm_serve_args: raise ValueError("You have manually specified the seed " @@ -104,8 +107,13 @@ class RemoteOpenAIServer: subparsers = parser.add_subparsers(required=False, dest="subparser") parser = ServeSubcommand().subparser_init(subparsers) args = parser.parse_args(["--model", model, *vllm_serve_args]) - self.host = str(args.host or 'localhost') - self.port = int(args.port) + self.uds = args.uds + if args.uds: + self.host = None + self.port = None + else: + self.host = str(args.host or 'localhost') + self.port = int(args.port) self.show_hidden_metrics = \ args.show_hidden_metrics_for_version is not None @@ -150,9 +158,11 @@ class RemoteOpenAIServer: def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() + client = (httpx.Client(transport=httpx.HTTPTransport( + uds=self.uds)) if self.uds else requests) while True: try: - if requests.get(url).status_code == 200: + if client.get(url).status_code == 200: break except Exception: # this exception can only be raised by requests.get, @@ -170,7 +180,8 @@ class RemoteOpenAIServer: @property def url_root(self) -> str: - return f"http://{self.host}:{self.port}" + return (f"http://{self.uds.split('/')[-1]}" + if self.uds else f"http://{self.host}:{self.port}") def 
url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 00eaba8c87..e5d31c1fd0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1777,6 +1777,12 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket: return sock +def create_server_unix_socket(path: str) -> socket.socket: + sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM) + sock.bind(path) + return sock + + def validate_api_server_args(args): valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ @@ -1807,8 +1813,11 @@ def setup_server(args): # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 - sock_addr = (args.host or "", args.port) - sock = create_server_socket(sock_addr) + if args.uds: + sock = create_server_unix_socket(args.uds) + else: + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) # workaround to avoid footguns where uvicorn drops requests with too # many concurrent requests active @@ -1820,12 +1829,14 @@ def setup_server(args): signal.signal(signal.SIGTERM, signal_handler) - addr, port = sock_addr - is_ssl = args.ssl_keyfile and args.ssl_certfile - host_part = f"[{addr}]" if is_valid_ipv6_address( - addr) else addr or "0.0.0.0" - listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" - + if args.uds: + listen_address = f"unix:{args.uds}" + else: + addr, port = sock_addr + is_ssl = args.ssl_keyfile and args.ssl_certfile + host_part = f"[{addr}]" if is_valid_ipv6_address( + addr) else addr or "0.0.0.0" + listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" return listen_address, sock diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e89463a03c..e15f65b430 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -72,6 +72,8 @@ class FrontendArgs: """Host name.""" port: int = 8000 """Port number.""" + uds: Optional[str] = None + """Unix domain socket path. 
If set, host and port arguments are ignored.""" uvicorn_log_level: Literal["debug", "info", "warning", "error", "critical", "trace"] = "info" """Log level for uvicorn.""" From e3edc0a7a8f015b938d5cd77a44638dde28ab3a9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 00:34:25 +0100 Subject: [PATCH 103/932] Extract `CompilationConfig` from `config.py` (#22524) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 33 -- vllm/{config.py => config/__init__.py} | 449 +------------------------ vllm/config/compilation.py | 428 +++++++++++++++++++++++ vllm/config/utils.py | 29 ++ vllm/engine/arg_utils.py | 8 +- 5 files changed, 467 insertions(+), 480 deletions(-) rename vllm/{config.py => config/__init__.py} (91%) create mode 100644 vllm/config/compilation.py create mode 100644 vllm/config/utils.py diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c282bf0023..93ac18dfcc 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -93,32 +93,6 @@ class NestedConfig: """field""" -@config -@dataclass -class FromCliConfig1: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 1 - return inst - - -@config -@dataclass -class FromCliConfig2: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 2 - return inst - - @config @dataclass class DummyConfig: @@ -144,10 +118,6 @@ class DummyConfig: """Dict which will be JSON in CLI""" nested_config: NestedConfig = field(default_factory=NestedConfig) """Nested config""" - from_cli_config1: FromCliConfig1 = field(default_factory=FromCliConfig1) - """Config with from_cli method""" - from_cli_config2: FromCliConfig2 = field(default_factory=FromCliConfig2) - """Different config with from_cli method""" @pytest.mark.parametrize(("type_hint", "expected"), [ @@ -199,9 +169,6 @@ def test_get_kwargs(): assert json_tip in kwargs["json_tip"]["help"] # nested config should should construct the nested config assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2) - # from_cli configs should be constructed with the correct method - assert kwargs["from_cli_config1"]["type"]('{"field": 2}').field == 3 - assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4 @pytest.mark.parametrize( diff --git a/vllm/config.py b/vllm/config/__init__.py similarity index 91% rename from vllm/config.py rename to vllm/config/__init__.py index 7147702edd..eaed6017cc 100644 --- a/vllm/config.py +++ b/vllm/config/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: F401 import ast import copy import enum @@ -10,11 +11,9 @@ import json import textwrap import uuid import warnings -from collections import Counter from collections.abc import Mapping from contextlib import contextmanager -from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, - replace) +from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from functools import cached_property, lru_cache from importlib.util import find_spec from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, @@ -22,7 +21,7 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, import regex as re import torch -from 
pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, +from pydantic import (ConfigDict, SkipValidation, field_validator, model_validator) from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE @@ -31,7 +30,9 @@ from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.config.compilation import (CompilationConfig, CompilationLevel, + PassConfig) +from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.platforms import current_platform @@ -50,8 +51,7 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, LayerBlockType, LazyLoader, common_broadcastable_dtype, cuda_device_count_stateless, get_cpu_memory, - get_open_port, is_torch_equal_or_newer, random_uuid, - resolve_obj_by_qualname) + get_open_port, random_uuid) # yapf: enable @@ -70,7 +70,6 @@ if TYPE_CHECKING: from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig - ConfigType = type[DataclassInstance] HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any @@ -83,7 +82,6 @@ else: BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any - ConfigType = type HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -93,7 +91,6 @@ else: logger = init_logger(__name__) DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance) -ConfigT = TypeVar("ConfigT", bound=ConfigType) TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription", "draft"] @@ -234,23 +231,6 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]: return out -def config(cls: ConfigT) -> ConfigT: - """ - A decorator that ensures all fields in a dataclass have default values - and that each field has a docstring. - - If a `ConfigT` is used as a CLI argument itself, the default value provided - by `get_kwargs` will be the result parsing a JSON string as the kwargs - (i.e. `ConfigT(**json.loads(cli_arg))`). However, if a particular `ConfigT` - requires custom construction from CLI (i.e. `CompilationConfig`), it can - have a `from_cli` method, which will be called instead. - - Config validation is performed by the tools/validate_config.py - script, which is invoked during the pre-commit checks. - """ - return cls - - def get_field(cls: ConfigType, name: str) -> Field: """Get the default factory field of a dataclass by name. Used for getting default factory fields in `EngineArgs`.""" @@ -4154,421 +4134,6 @@ class KVEventsConfig: """ -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 - - -@config -@dataclass -class PassConfig: - """Configuration for custom Inductor passes. 
- - This is separate from general `CompilationConfig` so that inductor passes - don't all have access to full configuration - that would create a cycle as - the `PassManager` is set as a property of config.""" - - enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) - """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" - enable_attn_fusion: bool = False - """Whether to enable the custom attention+quant fusion pass.""" - enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) - """Whether to enable the custom no-op elimination pass.""" - enable_sequence_parallelism: bool = False - """Whether to enable sequence parallelism.""" - enable_async_tp: bool = False - """Whether to enable async TP.""" - enable_fi_allreduce_fusion: bool = False - """Whether to enable flashinfer allreduce fusion.""" - fi_allreduce_fusion_max_token_num: int = 16384 - """Max number of tokens to used in flashinfer allreduce fusion.""" - - # TODO(luka) better pass enabling system. - - def uuid(self): - """ - Produces a hash unique to the pass configuration. - Any new fields that affect compilation should be added to the hash. - Any future fields that don't affect compilation should be excluded. - """ - return InductorPass.hash_dict(asdict(self)) - - def __post_init__(self) -> None: - if not self.enable_noop: - if self.enable_fusion: - logger.warning_once( - "Fusion enabled but reshape elimination disabled. " - "RMSNorm/SiluMul + quant (fp8) fusion might not work") - if self.enable_attn_fusion: - logger.warning_once( - "Fusion enabled but reshape elimination disabled. " - "Attention + quant (fp8) fusion might not work") - - -@config -@dataclass -class CompilationConfig: - """Configuration for compilation. It has three parts: - - - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] - - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - - [`backend`][vllm.config.CompilationConfig.backend] - - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] - - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] - - CudaGraph capture: - - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] - - [`cudagraph_capture_sizes`] - [vllm.config.CompilationConfig.cudagraph_capture_sizes] - - [`cudagraph_num_of_warmups`] - [vllm.config.CompilationConfig.cudagraph_num_of_warmups] - - [`cudagraph_copy_inputs`] - [vllm.config.CompilationConfig.cudagraph_copy_inputs] - - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] - - Inductor compilation: - - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] - - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - - [`inductor_compile_config`] - [vllm.config.CompilationConfig.inductor_compile_config] - - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] - - custom inductor passes - - Why we have different sizes for cudagraph and inductor: - - cudagraph: a cudagraph captured for a specific size can only be used - for the same size. We need to capture all the sizes we want to use. - - inductor: a graph compiled by inductor for a general shape can be used - for different sizes. Inductor can also compile for specific sizes, - where it can have more information to optimize the graph with fully - static shapes. However, we find the general shape compilation is - sufficient for most cases. 
It might be beneficial to compile for - certain small batchsizes, where inductor is good at optimizing. - """ - # Top-level Compilation control - level: Optional[int] = None - """The level of compilation: - - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" - debug_dump_path: str = "" - """The path to dump the debug information.""" - cache_dir: str = "" - """The directory to store the compiled graph, to accelerate Inductor - compilation. By default, it will use model-related information to generate - a cache directory.""" - backend: str = "" - """The backend for compilation. It needs to be a string: - - - "" (empty string): use the default backend. - - "eager"/"openxla"/...: use the specified backend registered in PyTorch. - - "full.module.name": a qualified name which can be used to import the - - backend function. - We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is - used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation - (it sees a part of the graph).""" - custom_ops: list[str] = field(default_factory=list) - """Fine-grained control over which custom ops to enable/disable. Use 'all' - to enable all, 'none' to disable all. Also specify a list of custom op - names to enable (prefixed with a '+'), or disable (prefixed with a '-'). - Examples: - - - 'all,-op1' to enable all except op1 - - 'none,+op1,+op2' to enable only op1 and op2 - - By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. - Inductor generates (fused) Triton kernels for disabled custom ops.""" - splitting_ops: list[str] = field(default_factory=list) - """A list of ops to split the full graph into subgraphs, used in piecewise - compilation.""" - - # Inductor capture - use_inductor: bool = True - """Whether to use inductor compilation: - - - False: inductor compilation is not used. graph runs in eager - (custom_ops enabled by default). - - True: inductor compilation is used (custom_ops disabled by default). - One graph for symbolic shape and one graph per size in compile_sizes - are compiled using configurations in inductor_compile_config. - - This setting is ignored if level1.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. 
- """ - factors: list[Any] = [] - factors.append(self.level) - factors.append(self.backend) - factors.append(self.custom_ops) - factors.append(self.splitting_ops) - factors.append(self.use_inductor) - factors.append(self.inductor_compile_config) - factors.append(self.inductor_passes) - factors.append(self.pass_config.uuid()) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __repr__(self) -> str: - exclude = { - "static_forward_context": True, - "enabled_custom_ops": True, - "disabled_custom_ops": True, - "compilation_time": True, - "bs_to_padded_graph_size": True, - "traced_files": True, - "inductor_compile_config": { - "post_grad_custom_post_pass": True, - }, - } - - # exclude default attr in pass_config - pass_config_exclude = {} - for attr, default_val in vars(PassConfig()).items(): - if getattr(self.pass_config, attr) == default_val: - pass_config_exclude[attr] = True - if pass_config_exclude: - exclude["pass_config"] = pass_config_exclude - - # The cast to string is necessary because Pydantic is mocked in docs - # builds and sphinx-argparse doesn't know the return type of decode() - return str( - TypeAdapter(CompilationConfig).dump_json( - self, - exclude=exclude, # type: ignore[arg-type] - exclude_unset=True).decode()) - - __str__ = __repr__ - - @classmethod - def from_cli(cls, cli_value: str) -> "CompilationConfig": - """Parse the CLI value for the compilation config. - -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. - """ - return TypeAdapter(CompilationConfig).validate_json(cli_value) - - def __post_init__(self) -> None: - count_none = self.custom_ops.count("none") - count_all = self.custom_ops.count("all") - assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" - - # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2: - # 1. A bug in PyTorch, fixed in 2.7: - # https://github.com/pytorch/pytorch/issues/147924 - # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't - # work with V2. Addressing this will take extra engineering effort - # and it is not yet a priority. 
RFC here: - # https://github.com/vllm-project/vllm/issues/14703 - - if is_torch_equal_or_newer("2.6"): - KEY = 'enable_auto_functionalized_v2' - if KEY not in self.inductor_compile_config: - self.inductor_compile_config[KEY] = False - - for k, v in self.inductor_passes.items(): - if not isinstance(v, str): - assert callable(v), ( - f"pass {k} should be callable or a qualified name") - self.inductor_compile_config[k] = v if isinstance( - v, InductorPass) else CallableInductorPass(v) - continue - - # resolve function from qualified name - names = v.split(".") - module = ".".join(names[:-1]) - func_name = names[-1] - func = __import__(module).__dict__[func_name] - self.inductor_compile_config[k] = func if isinstance( - func, InductorPass) else CallableInductorPass(func) - - if isinstance(self.pass_config, dict): - self.pass_config = PassConfig(**self.pass_config) - - def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: - if self.level == CompilationLevel.NO_COMPILATION: - raise ValueError("No compilation level is set.") - - from torch._dynamo.backends.registry import list_backends - torch_backends = list_backends(exclude_tags=tuple()) - if self.level in [ - CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE - ]: - if self.backend == "": - return "eager" - if self.backend in torch_backends: - return self.backend - return resolve_obj_by_qualname(self.backend) - - # TODO: pass user-specified backend to piecewise compilation - # merge with the config use_inductor - assert self.level == CompilationLevel.PIECEWISE - - from vllm.compilation.backends import VllmBackend - return VllmBackend(vllm_config) - - def init_with_cudagraph_sizes(self, - cudagraph_capture_sizes: list[int]) -> None: - """To complete the initialization of config, - we need to know the cudagraph sizes.""" - - if self.cudagraph_capture_sizes is None: - self.cudagraph_capture_sizes = cudagraph_capture_sizes - else: - # de-duplicate the sizes provided by the config - dedup_sizes = list(set(self.cudagraph_capture_sizes)) - if len(dedup_sizes) < len(self.cudagraph_capture_sizes): - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - cudagraph_capture_sizes, dedup_sizes) - self.cudagraph_capture_sizes = dedup_sizes - - computed_compile_sizes = [] - if self.compile_sizes is not None: - # de-duplicate the sizes provided by the config - self.compile_sizes = list(set(self.compile_sizes)) - for x in self.compile_sizes: - if isinstance(x, str): - assert x == "cudagraph_capture_sizes", \ - "Unrecognized size type in compile_sizes, " \ - f"expect 'cudagraph_capture_sizes', got {x}" - computed_compile_sizes.extend(self.cudagraph_capture_sizes) - else: - assert isinstance(x, int) - computed_compile_sizes.append(x) - self.compile_sizes = computed_compile_sizes # type: ignore - - # sort to make sure cudagraph capture sizes are in descending order - self.cudagraph_capture_sizes.sort(reverse=True) - self.max_capture_size = self.cudagraph_capture_sizes[ - 0] if self.cudagraph_capture_sizes else 0 - - # pre-compute the mapping from batch size to padded graph size - self.bs_to_padded_graph_size = [ - 0 for i in range(self.max_capture_size + 1) - ] - for end, start in zip(self.cudagraph_capture_sizes, - self.cudagraph_capture_sizes[1:] + [0]): - for bs in range(start, end): - if bs == start: - self.bs_to_padded_graph_size[bs] = start - else: - self.bs_to_padded_graph_size[bs] = end - self.bs_to_padded_graph_size[ - self.max_capture_size] = self.max_capture_size - - def 
set_splitting_ops_for_v1(self): - # NOTE: this function needs to be called - if self.splitting_ops and self.full_cuda_graph: - raise ValueError("full_cuda_graph cannot be used together with " - "splitting_ops, as Full CUDA graph will override " - f"the splitting_ops: {self.splitting_ops}") - - if not self.splitting_ops: - self.splitting_ops = [] if self.full_cuda_graph else [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - "vllm.mamba_mixer2", - ] - - @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py new file mode 100644 index 0000000000..c1b3a61217 --- /dev/null +++ b/vllm/config/compilation.py @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections import Counter +from dataclasses import asdict, field +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +from pydantic import TypeAdapter +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname + +if TYPE_CHECKING: + from vllm.config.config import VllmConfig +else: + VllmConfig = object + +logger = init_logger(__name__) + + +class CompilationLevel: + # constants for the levels of the compilation process + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + PIECEWISE = 3 + + +@config +@dataclass +class PassConfig: + """Configuration for custom Inductor passes. + + This is separate from general `CompilationConfig` so that inductor passes + don't all have access to full configuration - that would create a cycle as + the `PassManager` is set as a property of config.""" + + enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) + """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" + enable_attn_fusion: bool = False + """Whether to enable the custom attention+quant fusion pass.""" + enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) + """Whether to enable the custom no-op elimination pass.""" + enable_sequence_parallelism: bool = False + """Whether to enable sequence parallelism.""" + enable_async_tp: bool = False + """Whether to enable async TP.""" + enable_fi_allreduce_fusion: bool = False + """Whether to enable flashinfer allreduce fusion.""" + fi_allreduce_fusion_max_token_num: int = 16384 + """Max number of tokens to used in flashinfer allreduce fusion.""" + + # TODO(luka) better pass enabling system. + + def uuid(self): + """ + Produces a hash unique to the pass configuration. + Any new fields that affect compilation should be added to the hash. + Any future fields that don't affect compilation should be excluded. + """ + return InductorPass.hash_dict(asdict(self)) + + def __post_init__(self) -> None: + if not self.enable_noop: + if self.enable_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "RMSNorm/SiluMul + quant (fp8) fusion might not work") + if self.enable_attn_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "Attention + quant (fp8) fusion might not work") + + +@config +@dataclass +class CompilationConfig: + """Configuration for compilation. 
It has three parts: + + - Top-level Compilation control: + - [`level`][vllm.config.CompilationConfig.level] + - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] + - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] + - [`backend`][vllm.config.CompilationConfig.backend] + - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] + - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] + - CudaGraph capture: + - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] + - [`cudagraph_capture_sizes`] + [vllm.config.CompilationConfig.cudagraph_capture_sizes] + - [`cudagraph_num_of_warmups`] + [vllm.config.CompilationConfig.cudagraph_num_of_warmups] + - [`cudagraph_copy_inputs`] + [vllm.config.CompilationConfig.cudagraph_copy_inputs] + - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] + - Inductor compilation: + - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] + - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`inductor_compile_config`] + [vllm.config.CompilationConfig.inductor_compile_config] + - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] + - custom inductor passes + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ + # Top-level Compilation control + level: Optional[int] = None + """The level of compilation: + + - None: If None, we will select the default compilation level. + For V1 engine this is 3, for V0 engine this is 0. + - 0: no compilation. + - 1: dynamo as is. + - 2: dynamo once. + - 3: piecewise compilation.""" + debug_dump_path: str = "" + """The path to dump the debug information.""" + cache_dir: str = "" + """The directory to store the compiled graph, to accelerate Inductor + compilation. By default, it will use model-related information to generate + a cache directory.""" + backend: str = "" + """The backend for compilation. It needs to be a string: + + - "" (empty string): use the default backend. + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. + - "full.module.name": a qualified name which can be used to import the + + backend function. + We use string to avoid serialization issues when using compilation in a + distributed setting. When the compilation level is 1 or 2, the backend is + used for the compilation directly (it sees the whole graph). When the + compilation level is 3, the backend is used for the piecewise compilation + (it sees a part of the graph).""" + custom_ops: list[str] = field(default_factory=list) + """Fine-grained control over which custom ops to enable/disable. Use 'all' + to enable all, 'none' to disable all. Also specify a list of custom op + names to enable (prefixed with a '+'), or disable (prefixed with a '-'). 
+ Examples: + + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + + By default, all custom ops are enabled when running without Inductor and + disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + Inductor generates (fused) Triton kernels for disabled custom ops.""" + splitting_ops: list[str] = field(default_factory=list) + """A list of ops to split the full graph into subgraphs, used in piecewise + compilation.""" + + # Inductor capture + use_inductor: bool = True + """Whether to use inductor compilation: + + - False: inductor compilation is not used. graph runs in eager + (custom_ops enabled by default). + - True: inductor compilation is used (custom_ops disabled by default). + One graph for symbolic shape and one graph per size in compile_sizes + are compiled using configurations in inductor_compile_config. + + This setting is ignored if level1.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.level) + factors.append(self.backend) + factors.append(self.custom_ops) + factors.append(self.splitting_ops) + factors.append(self.use_inductor) + factors.append(self.inductor_compile_config) + factors.append(self.inductor_passes) + factors.append(self.pass_config.uuid()) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __repr__(self) -> str: + exclude = { + "static_forward_context": True, + "enabled_custom_ops": True, + "disabled_custom_ops": True, + "compilation_time": True, + "bs_to_padded_graph_size": True, + "traced_files": True, + "inductor_compile_config": { + "post_grad_custom_post_pass": True, + }, + } + + # exclude default attr in pass_config + pass_config_exclude = {} + for attr, default_val in vars(PassConfig()).items(): + if getattr(self.pass_config, attr) == default_val: + pass_config_exclude[attr] = True + if pass_config_exclude: + exclude["pass_config"] = pass_config_exclude + + return TypeAdapter(CompilationConfig).dump_json( + self, + exclude=exclude, # type: ignore[arg-type] + exclude_unset=True).decode() + + __str__ = __repr__ + + def __post_init__(self) -> None: + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2: + # 1. A bug in PyTorch, fixed in 2.7: + # https://github.com/pytorch/pytorch/issues/147924 + # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't + # work with V2. Addressing this will take extra engineering effort + # and it is not yet a priority. 
RFC here: + # https://github.com/vllm-project/vllm/issues/14703 + + if is_torch_equal_or_newer("2.6"): + KEY = 'enable_auto_functionalized_v2' + if KEY not in self.inductor_compile_config: + self.inductor_compile_config[KEY] = False + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) + + if isinstance(self.pass_config, dict): + self.pass_config = PassConfig(**self.pass_config) + + def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]: + if self.level == CompilationLevel.NO_COMPILATION: + raise ValueError("No compilation level is set.") + + from torch._dynamo.backends.registry import list_backends + torch_backends = list_backends(exclude_tags=tuple()) + if self.level in [ + CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE + ]: + if self.backend == "": + return "eager" + if self.backend in torch_backends: + return self.backend + return resolve_obj_by_qualname(self.backend) + + # TODO: pass user-specified backend to piecewise compilation + # merge with the config use_inductor + assert self.level == CompilationLevel.PIECEWISE + + from vllm.compilation.backends import VllmBackend + return VllmBackend(vllm_config) + + def init_with_cudagraph_sizes(self, + cudagraph_capture_sizes: list[int]) -> None: + """To complete the initialization of config, + we need to know the cudagraph sizes.""" + + if self.cudagraph_capture_sizes is None: + self.cudagraph_capture_sizes = cudagraph_capture_sizes + else: + # de-duplicate the sizes provided by the config + dedup_sizes = list(set(self.cudagraph_capture_sizes)) + if len(dedup_sizes) < len(self.cudagraph_capture_sizes): + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + cudagraph_capture_sizes, dedup_sizes) + self.cudagraph_capture_sizes = dedup_sizes + + computed_compile_sizes = [] + if self.compile_sizes is not None: + # de-duplicate the sizes provided by the config + self.compile_sizes = list(set(self.compile_sizes)) + for x in self.compile_sizes: + if isinstance(x, str): + assert x == "cudagraph_capture_sizes", \ + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" + computed_compile_sizes.extend(self.cudagraph_capture_sizes) + else: + assert isinstance(x, int) + computed_compile_sizes.append(x) + self.compile_sizes = computed_compile_sizes # type: ignore + + # sort to make sure cudagraph capture sizes are in descending order + self.cudagraph_capture_sizes.sort(reverse=True) + self.max_capture_size = self.cudagraph_capture_sizes[ + 0] if self.cudagraph_capture_sizes else 0 + + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_capture_size + 1) + ] + for end, start in zip(self.cudagraph_capture_sizes, + self.cudagraph_capture_sizes[1:] + [0]): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end + self.bs_to_padded_graph_size[ + self.max_capture_size] = self.max_capture_size + + def 
set_splitting_ops_for_v1(self): + # NOTE: this function needs to be called + if self.splitting_ops and self.full_cuda_graph: + raise ValueError("full_cuda_graph cannot be used together with " + "splitting_ops, as Full CUDA graph will override " + f"the splitting_ops: {self.splitting_ops}") + + if not self.splitting_ops: + self.splitting_ops = [] if self.full_cuda_graph else [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.mamba_mixer2", + ] diff --git a/vllm/config/utils.py b/vllm/config/utils.py new file mode 100644 index 0000000000..98fbeb1fa8 --- /dev/null +++ b/vllm/config/utils.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + from _typeshed import DataclassInstance + + ConfigType = type[DataclassInstance] +else: + ConfigType = type + +ConfigT = TypeVar("ConfigT", bound=ConfigType) + + +def config(cls: ConfigT) -> ConfigT: + """ + A decorator that ensures all fields in a dataclass have default values + and that each field has a docstring. + + If a `ConfigT` is used as a CLI argument itself, the `type` keyword argument + provided by `get_kwargs` will be + `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the + `cli_arg` as a JSON string which gets validated by `pydantic`. + + Config validation is performed by the tools/validate_config.py + script, which is invoked during the pre-commit checks. + """ + return cls diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c0ac3ff631..c9dc99cad2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -193,8 +193,6 @@ Additionally, list elements can be passed individually using `+`: def parse_dataclass(val: str, cls=dataclass_cls) -> Any: try: - if hasattr(cls, "from_cli"): - return cls.from_cli(val) return TypeAdapter(cls).validate_json(val) except ValidationError as e: raise argparse.ArgumentTypeError(repr(e)) from e @@ -455,9 +453,9 @@ class EngineArgs: # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int, dict)): - self.compilation_config = CompilationConfig.from_cli( - str(self.compilation_config)) + if isinstance(self.compilation_config, dict): + self.compilation_config = CompilationConfig( + **self.compilation_config) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() From 311d875614583b7070d16c786c791a3817a8c10a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 8 Aug 2025 19:56:47 -0400 Subject: [PATCH 104/932] Drop flaky test_healthcheck_response_time (#22539) Signed-off-by: Russell Bryant --- .../openai/test_async_tokenization.py | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index ab3c809054..80261597b1 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -2,15 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import contextlib import random -import time from typing import Callable import openai import pytest import pytest_asyncio -import requests from tests.utils import RemoteOpenAIServer @@ -87,54 +84,3 @@ async def test_with_and_without_truncate( responses = await asyncio.gather(*[get_status_code(**b) for b in bodies]) 
assert 500 not in responses - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - ids=["single completion", "multiple completions", "chat"], - argnames=["create_func_gen", "content_body"], - argvalues=[ - (lambda x: x.completions.create, { - "prompt": " ".join(['A'] * 300_000) - }), - (lambda x: x.completions.create, { - "prompt": [" ".join(['A'] * 300_000)] * 2 - }), - (lambda x: x.chat.completions.create, { - "messages": [{ - "role": "user", - "content": " ".join(['A'] * 300_000) - }] - }), - ], -) -async def test_healthcheck_response_time( - server: RemoteOpenAIServer, - client: openai.AsyncOpenAI, - create_func_gen: Callable, - content_body: dict, -): - num_requests = 50 - - create_func = create_func_gen(client) - body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} - - def get_response_time(url): - start_time = time.monotonic() - res = requests.get(url) - end_time = time.monotonic() - assert res.status_code == 200 - return end_time - start_time - - no_load_response_time = get_response_time(server.url_for("health")) - tasks = [ - asyncio.create_task(create_func(**body)) for _ in range(num_requests) - ] - await asyncio.sleep(1) # give the tasks a chance to start running - load_response_time = get_response_time(server.url_for("health")) - - with contextlib.suppress(openai.APIStatusError): - await asyncio.gather(*tasks) - - assert load_response_time < 100 * no_load_response_time - assert load_response_time < 0.1 From 81c57f60a2c77d169dbec021bb58a467edf580f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 9 Aug 2025 08:03:45 +0800 Subject: [PATCH 105/932] [XPU] upgrade torch 2.8 on for XPU (#22300) Signed-off-by: Kunshang Ji --- docker/Dockerfile.xpu | 17 +++++++++++------ requirements/xpu.txt | 11 +++-------- vllm/plugins/__init__.py | 9 --------- vllm/v1/worker/xpu_worker.py | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 7d5a589eb1..65d2e5036b 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -1,9 +1,12 @@ -# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. 
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base +FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base RUN rm /etc/apt/sources.list.d/intel-graphics.list -RUN apt-get update -y && \ +RUN apt clean && apt-get update -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y python3.10 python3.10-distutils && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ ffmpeg \ @@ -14,11 +17,13 @@ RUN apt-get update -y && \ libgl1 \ lsb-release \ numactl \ - python3 \ - python3-dev \ - python3-pip \ + python3.10-dev \ wget + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 + WORKDIR /workspace/vllm COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt COPY requirements/common.txt /workspace/vllm/requirements/common.txt diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 0d95dc5715..4607c3efdf 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,15 +10,10 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding - -torch==2.7.0+xpu +--extra-index-url=https://download.pytorch.org/whl/xpu +torch==2.8.0+xpu torchaudio torchvision pytorch-triton-xpu ---extra-index-url=https://download.pytorch.org/whl/xpu - -# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu -# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. 
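The requirements change above moves the XPU stack onto the torch 2.8 wheels pulled from the PyTorch XPU index. As a minimal sanity check, not taken from the patch and assuming only the public torch.xpu API, an environment built from the updated requirements/xpu.txt can be verified with:

    import torch

    # Expect a version string such as "2.8.0+xpu" once the new wheels are installed.
    print(torch.__version__)
    # True only when an Intel GPU plus a matching driver/oneAPI runtime are visible.
    print(torch.xpu.is_available())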
-intel-extension-for-pytorch==2.7.10+xpu -oneccl_bind_pt==2.7.0+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +intel-extension-for-pytorch==2.8.10+xpu diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 51c78ddc1a..1a1760df82 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,8 +4,6 @@ import logging from typing import Any, Callable -import torch - import vllm.envs as envs logger = logging.getLogger(__name__) @@ -68,13 +66,6 @@ def load_general_plugins(): return plugins_loaded = True - # some platform-specific configurations - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 2a7e0625b2..134d839252 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -152,7 +152,7 @@ class XPUWorker(Worker): raise RuntimeError( f"Not support device type: {self.device_config.device}") - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "drmfd") + ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd") ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", str(self.parallel_config.world_size)) From 35afe1b30b154114dc2ee8329e12f8cf3fe9f576 Mon Sep 17 00:00:00 2001 From: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:04:15 -0400 Subject: [PATCH 106/932] [BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pradyun Ramadorai Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Co-authored-by: Pradyun Ramadorai Co-authored-by: Nicolò Lucchesi --- vllm/v1/core/sched/scheduler.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d39aea1f2d..430085d9c9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -437,14 +437,24 @@ class Scheduler(SchedulerInterface): # The request cannot be scheduled. break + # Handles an edge case when P/D Disaggregation + # is used with Spec Decoding where an + # extra block gets allocated which + # creates a mismatch between the number + # of local and remote blocks. + effective_lookahead_tokens = (0 if request.num_computed_tokens + == 0 else + self.num_lookahead_tokens) + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, num_new_local_computed_tokens, new_computed_blocks, - num_lookahead_tokens=self.num_lookahead_tokens, + num_lookahead_tokens=effective_lookahead_tokens, delay_cache_blocks=load_kv_async, ) + if new_blocks is None: # The request cannot be scheduled. 
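The scheduler change above replaces the unconditional num_lookahead_tokens with effective_lookahead_tokens, so that no speculative lookahead slots are reserved the first time a request is allocated (num_computed_tokens == 0, for example while its prefill KV blocks are still being pulled from the remote prefill worker); this keeps the local and remote block counts aligned. A minimal sketch of that rule in isolation, using an illustrative helper name that does not appear in the patch:

    def effective_lookahead_tokens(num_computed_tokens: int,
                                   num_lookahead_tokens: int) -> int:
        # First allocation for a request (nothing computed yet, e.g. during the
        # P/D transfer of its prefill): reserve no lookahead slots so the local
        # block count matches the remote one. Subsequent allocations reserve
        # the usual speculative-decoding lookahead.
        return 0 if num_computed_tokens == 0 else num_lookahead_tokens

    assert effective_lookahead_tokens(0, 4) == 0      # initial P/D allocation
    assert effective_lookahead_tokens(256, 4) == 4    # later decode steps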
break From 429e4e2d420f7c648d37b7d90430f5df6a7dc61f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 9 Aug 2025 13:17:22 +0800 Subject: [PATCH 107/932] [Bugfix] Fix ModernBert cuda graph capturing in v1 (#21901) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/language/pooling/mteb_utils.py | 5 ++- vllm/model_executor/models/bert.py | 2 +- vllm/model_executor/models/bert_with_rope.py | 46 +++++++++----------- vllm/model_executor/models/modernbert.py | 22 +++++----- vllm/model_executor/models/roberta.py | 6 +-- 5 files changed, 39 insertions(+), 42 deletions(-) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 8c93bbdc98..77aaddb4f5 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -162,7 +162,8 @@ def mteb_test_embed_models(hf_runner, vllm_runner, model_info: EmbedModelInfo, vllm_extra_kwargs=None, - hf_model_callback=None): + hf_model_callback=None, + atol=MTEB_RERANK_TOL): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. @@ -198,7 +199,7 @@ def mteb_test_embed_models(hf_runner, print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) + assert st_main_score == pytest.approx(vllm_main_score, abs=atol) def run_mteb_rerank(cross_encoder, tasks, languages): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 504621c8ab..8f988903f7 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -466,7 +466,7 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 59033cb74a..050f18f16e 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -8,13 +8,15 @@ from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, torch_vllm_outplace_fused_experts) from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -284,15 +286,22 @@ class NomicMoE(nn.Module): hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.router(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w1, - self.w2, - router_logits, - self.top_k, - renormalize=False, - inplace=False, - activation=self.hidden_act, - is_act_and_mul=False) + # FIXME(Isotr0py): This implementation is too tricky, + # we should use FusedMoE instead in the future + # after supporting ungated 
activation for it. + topk_weights, topk_ids, _ = fused_topk(hidden_states, + router_logits, + self.top_k, + renormalize=False) + final_hidden_states = torch_vllm_outplace_fused_experts( + hidden_states=hidden_states, + w1=self.w1, + w2=self.w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=self.hidden_act, + is_act_and_mul=False, + ) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -391,6 +400,7 @@ class BertWithRopeEncoder(nn.Module): return hidden_states +@support_torch_compile class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) @@ -407,7 +417,7 @@ class BertWithRope(nn.Module, SupportsQuant): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -554,20 +564,6 @@ class JinaRobertaModel(BertWithRope): "norm2": "mlp_ln", }) - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - return super().forward(input_ids=input_ids, - positions=position_ids, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - token_type_ids=token_type_ids) - @torch.inference_mode() def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 4967032a24..761fce815e 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -8,6 +8,7 @@ from torch import nn from transformers import ModernBertConfig from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -46,7 +47,7 @@ class ModernBertEmbeddings(nn.Module): input_ids: torch.Tensor, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if inputs_embeds: + if inputs_embeds is not None: return self.norm(inputs_embeds) else: inputs_embeds = self.tok_embeddings(input_ids) @@ -117,7 +118,7 @@ class ModernBertAttention(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, + position_ids: torch.Tensor, ) -> torch.Tensor: qkv, _ = self.Wqkv(hidden_states) q, k, v = qkv.split([self.all_head_size] * 3, dim=-1) @@ -169,9 +170,9 @@ class ModernBertLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - ): - attn_outputs = self.attn(self.attn_norm(hidden_states), + position_ids: torch.Tensor, + ) -> torch.Tensor: + attn_outputs = self.attn(hidden_states=self.attn_norm(hidden_states), position_ids=position_ids) hidden_states = hidden_states + attn_outputs mlp_output = self.mlp(self.mlp_norm(hidden_states)) @@ -192,13 +193,14 @@ class ModernBertEncoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, + position_ids: torch.Tensor, ) -> torch.Tensor: for i, layer in enumerate(self.layers): hidden_states = layer(hidden_states, position_ids) return hidden_states +@support_torch_compile class 
ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -234,13 +236,11 @@ class ModernBertModel(nn.Module): def forward( self, - input_ids: Optional[torch.LongTensor] = None, - positions: Optional[torch.Tensor] = None, + input_ids: torch.Tensor, + positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, ) -> torch.Tensor: - position_ids = positions if positions is not None else position_ids if inputs_embeds is not None: hidden_states = inputs_embeds else: @@ -249,7 +249,7 @@ class ModernBertModel(nn.Module): outputs = self.encoder_layer( hidden_states=hidden_states, - position_ids=position_ids, + position_ids=positions, ) norm_outputs = self.final_norm(outputs) return norm_outputs diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 77e072c792..61c8faed40 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -105,7 +105,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, @@ -119,8 +119,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel): position_ids=positions, padding_idx=self.padding_idx) - return self.model(input_ids=input_ids, - position_ids=positions, + return self.model(input_ids, + positions, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) From 08b751ba749541259e5450d6371d822fdf769b8a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 8 Aug 2025 22:21:40 -0700 Subject: [PATCH 108/932] Implicit language-model-only mode via limit-mm-per-prompt (#22299) Signed-off-by: Roger Wang Signed-off-by: Andy Xie Signed-off-by: tjtanaa Signed-off-by: Andrew Sansom Signed-off-by: Zhiyu Cheng Signed-off-by: Shu Wang Signed-off-by: Po-Han Huang Signed-off-by: Shu Wang. 
Signed-off-by: XIn Li Signed-off-by: Junhao Li Signed-off-by: chaunceyjiang Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: zitian.zhao Signed-off-by: zitian zhao Signed-off-by: DarkLight1337 Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Signed-off-by: Linkun Co-authored-by: Ning Xie Co-authored-by: TJian Co-authored-by: Andrew Sansom Co-authored-by: Zhiyu Co-authored-by: Shu Wang Co-authored-by: XIn Li Co-authored-by: Junhao Li Co-authored-by: Chauncey Co-authored-by: Yuxuan Zhang <2448370773@qq.com> Co-authored-by: ZiTian Zhao Co-authored-by: Cyrus Leung Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com> Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Hong Hanh Co-authored-by: youkaichao Co-authored-by: lkchen --- tests/multimodal/test_registry.py | 38 ++++++++++++ vllm/config/__init__.py | 9 --- vllm/model_executor/models/llava.py | 34 +++++++---- vllm/model_executor/models/mistral3.py | 38 +++++++----- vllm/model_executor/models/mllama4.py | 30 ++++++---- .../models/qwen2_5_omni_thinker.py | 33 +++++++--- vllm/model_executor/models/qwen2_5_vl.py | 22 ++++--- vllm/model_executor/models/qwen2_vl.py | 26 +++++--- vllm/model_executor/models/step3_vl.py | 60 ++++++++++++------- vllm/multimodal/registry.py | 39 ++++++++++++ vllm/v1/core/encoder_cache_manager.py | 2 +- vllm/v1/engine/core.py | 3 +- vllm/v1/engine/mm_input_cache.py | 12 ++-- vllm/v1/engine/processor.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 16 ++--- vllm/v1/worker/tpu_model_runner.py | 23 ++++--- 16 files changed, 271 insertions(+), 116 deletions(-) create mode 100644 tests/multimodal/test_registry.py diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py new file mode 100644 index 0000000000..d31e75bc27 --- /dev/null +++ b/tests/multimodal/test_registry.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for MultiModalRegistry.supports_multimodal_inputs and +Qwen2.5-VL visual component loading behavior. 
+""" + +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ..models.utils import build_model_context + + +@pytest.mark.parametrize( + "model_id,limit_mm_per_prompt,expected", + [ + ("Qwen/Qwen2-0.5B-Instruct", {}, False), + ("Qwen/Qwen2.5-VL-3B-Instruct", {}, True), + ("Qwen/Qwen2.5-VL-3B-Instruct", { + "image": 0, + "video": 0 + }, False), + ("Qwen/Qwen2.5-VL-3B-Instruct", { + "image": 0 + }, True), + ], +) +@pytest.mark.core_model +def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): + """Test supports_multimodal_inputs returns correct boolean for various + configs.""" + ctx = build_model_context( + model_id, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + assert MULTIMODAL_REGISTRY.supports_multimodal_inputs( + ctx.model_config) is expected \ No newline at end of file diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index eaed6017cc..69c05b75d3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1695,15 +1695,6 @@ class ModelConfig: return mm_config.mm_processor_cache_gb > 0 - @property - def enable_mm_input_cache(self) -> bool: - """Whether the multi-modal input cache should be enabled.""" - mm_config = self.multimodal_config - if mm_config is None: - return False - - return mm_config.mm_processor_cache_gb > 0 - def get_mm_input_cache_gb(self) -> int: mm_config = self.multimodal_config if mm_config is None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c863ba4064..cfc6ffd99a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -521,18 +521,22 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = init_vision_tower_for_llava( - config, - quant_config, - require_post_norm=False, - prefix=maybe_prefix(prefix, "vision_tower")) - self.multi_modal_projector = LlavaMultiModalProjector( - vision_hidden_size=config.vision_config.hidden_size, - text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act, - multimodal_projector_bias=config.multimodal_projector_bias, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_tower = init_vision_tower_for_llava( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + self.multi_modal_projector = LlavaMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_tower = None + self.multi_modal_projector = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -756,7 +760,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.vision_tower is None and self.multi_modal_projector is None: + skip_prefixes.extend(["vision_tower.", "multi_modal_projector."]) + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 88c3823eaa..9e29a96c6e 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -428,20 +428,24 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = init_vision_tower_for_llava( - config, - quant_config, - require_post_norm=False, - prefix=maybe_prefix(prefix, "vision_tower")) - self.multi_modal_projector = Mistral3MultiModalProjector( - vision_hidden_size=config.vision_config.hidden_size, - text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act, - spatial_merge_size=config.spatial_merge_size, - patch_size=config.vision_config.patch_size, - multimodal_projector_bias=config.multimodal_projector_bias, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_tower = init_vision_tower_for_llava( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + self.multi_modal_projector = Mistral3MultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + spatial_merge_size=config.spatial_merge_size, + patch_size=config.vision_config.patch_size, + multimodal_projector_bias=config.multimodal_projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_tower = None + self.multi_modal_projector = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -611,7 +615,11 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.vision_tower is None and self.multi_modal_projector is None: + skip_prefixes = ["vision_tower.", "multi_modal_projector."] + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e73dc0c2be..b405dfca6d 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -737,16 +737,20 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.vision_model = Llama4VisionModel( - config.vision_config, - None, - prefix=maybe_prefix(prefix, "vision_model"), - use_data_parallel=self.use_data_parallel, - ) - self.multi_modal_projector = Llama4MultiModalProjector( - self.config, - None, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_model = Llama4VisionModel( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel, + ) + self.multi_modal_projector = Llama4MultiModalProjector( + self.config, + None, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_model = None + self.multi_modal_projector = None self.language_model = initialize_model( vllm_config=vllm_config.with_hf_config(config.text_config, ["LlamaForCausalLM"]), @@ -783,6 +787,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_input( self, image_input: Llama4ImagePatchInputs) -> MultiModalEmbeddings: + + assert self.vision_model and self.multi_modal_projector flat_data = image_input["flat_data"] patches_per_image = image_input["patches_per_image"].tolist() @@ 
-1048,6 +1054,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, language_model_weights, other_weights = ( self._separate_and_rename_weights(weights)) + # Skip loading vision model and projector if they're not initialized. + if self.vision_model is None and self.multi_modal_projector is None: + other_weights = [] + # Handle expert scale parameters regular_weights, expert_scale_weights, updated_params_from_experts = ( self._handle_expert_scale_broadcasting(language_model_weights, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a3af541d20..e95295c318 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -722,13 +722,24 @@ class Qwen2_5OmniThinkerForConditionalGeneration( "exactly same result as the transformers implementation " "in the audio tower part.") - self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config) - self.visual = Qwen2_5_VisionTransformer( - vision_config=thinker_config.vision_config, - norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("audio"): + self.audio_tower = Qwen2_5OmniAudioEncoder( + thinker_config.audio_config) + else: + self.audio_tower = None + + if multimodal_config.get_limit_per_prompt( + "image") or multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2_5_VisionTransformer( + vision_config=thinker_config.vision_config, + norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", + 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None + self.quant_config = quant_config self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -886,9 +897,15 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = ["talker.", "token2wav."] + if self.audio_tower is None: + skip_prefixes.extend(["audio_tower."]) + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader( self, - skip_prefixes=["talker.", "token2wav."], + skip_prefixes=skip_prefixes, ) loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 79c5c77f6d..6bea180ffe 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -843,12 +843,17 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.visual = Qwen2_5_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(self.quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("image") or \ + multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config( + self.quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -1152,7 +1157,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, 
SupportsMultiModal, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 633f8598e8..f2d438b385 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1049,12 +1049,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.visual = Qwen2VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("image") or \ + multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -1350,7 +1354,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: @@ -1445,5 +1452,8 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 363c12a4bf..41dba312cb 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -837,27 +837,35 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.vision_model = Step3VisionTransformer(config.vision_config, - None, - prefix=maybe_prefix( - prefix, "vision_model")) - self.vit_downsampler = nn.Conv2d( - config.vision_config.hidden_size, - config.vision_config.output_hidden_size, - kernel_size=2, - stride=config.understand_projector_stride) - self.vit_downsampler2 = nn.Conv2d( - config.vision_config.output_hidden_size, - config.vision_config.output_hidden_size * 2, - kernel_size=3, - stride=2, - padding=1, - ) - self.vit_large_projector = nn.Linear( - config.vision_config.output_hidden_size * 2, - config.hidden_size, - bias=config.projector_bias, - ) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_model = Step3VisionTransformer(config.vision_config, + None, + prefix=maybe_prefix( + prefix, + "vision_model")) + self.vit_downsampler = nn.Conv2d( + 
config.vision_config.hidden_size, + config.vision_config.output_hidden_size, + kernel_size=2, + stride=config.understand_projector_stride) + self.vit_downsampler2 = nn.Conv2d( + config.vision_config.output_hidden_size, + config.vision_config.output_hidden_size * 2, + kernel_size=3, + stride=2, + padding=1, + ) + self.vit_large_projector = nn.Linear( + config.vision_config.output_hidden_size * 2, + config.hidden_size, + bias=config.projector_bias, + ) + else: + self.vision_model = None + self.vit_downsampler = None + self.vit_downsampler2 = None + self.vit_large_projector = None + self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.text_config, @@ -1046,7 +1054,15 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.sample(logits, sampling_metadata) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) + + skip_prefixes = [] + if self.vision_model is None and self.vit_large_projector is None: + skip_prefixes = [ + "vision_model.", "vit_downsampler.", "vit_downsampler2.", + "vit_large_projector." + ] + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loaded_weights diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 565d54e1a2..a101f2a55f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -115,6 +115,45 @@ class MultiModalRegistry: return True # Success + def enable_mm_input_cache(self, model_config: "ModelConfig") -> bool: + """Whether the multi-modal input cache should be enabled. + NOTE: This is put under MultiModalRegistry on purpose to respect + text-only mode for multimodal models. + """ + + if not self.supports_multimodal_inputs(model_config): + return False + + mm_config = model_config.get_multimodal_config() + + return mm_config.mm_processor_cache_gb > 0 + + def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: + """ + Checks if the model supports multimodal inputs. + Returns True if the model is multimodal with any non-zero supported + modalities, otherwise returns False, effectively running in + text-only mode. + """ + if not model_config.is_multimodal_model: + return False + + processor = self.create_processor(model_config, disable_cache=False) + supported_modalities = processor.info.get_supported_mm_limits() + + mm_config = model_config.get_multimodal_config() + + # Check if all supported modalities have limit == 0 + if all( + mm_config.get_limit_per_prompt(modality) == 0 + for modality in supported_modalities): + logger.info_once( + "All limits of multimodal modalities supported by the model " + "are set to 0, running in text-only mode.") + return False + + return True + def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 67ea3b007e..faf5c132f8 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -189,7 +189,7 @@ def compute_encoder_budget( in the input sequence. """ - if not model_config.is_multimodal_model: + if not mm_registry.supports_multimodal_inputs(model_config): return 0, 0 # TODO: handle encoder-decoder models once we support them. 
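Taken together, the model changes above (LLaVA, Mistral-3, Llama-4, Qwen2/2.5-VL, the Omni thinker and Step3) apply one pattern: if every per-prompt limit for a module's modality is 0, the tower and projector are never constructed, and AutoWeightsLoader is told to skip the corresponding checkpoint prefixes so those weights are neither loaded nor held in GPU memory. A condensed, runnable toy of that pattern in plain PyTorch (the class and method names below are illustrative, not vLLM APIs):

import torch
from torch import nn


class ToyVLModel(nn.Module):
    """Toy stand-in for the conditional-initialization pattern above."""

    def __init__(self, image_limit: int):
        super().__init__()
        # Only build the vision tower when images are actually allowed.
        self.vision_tower = nn.Linear(16, 8) if image_limit else None
        self.language_model = nn.Linear(8, 8)

    def load_filtered(self, state_dict: dict[str, torch.Tensor]) -> None:
        # Rough analogue of AutoWeightsLoader(self, skip_prefixes=...): drop
        # keys for modules that were never constructed instead of failing.
        skip_prefixes = [] if self.vision_tower is not None else ["vision_tower."]
        kept = {
            name: tensor
            for name, tensor in state_dict.items()
            if not any(name.startswith(p) for p in skip_prefixes)
        }
        self.load_state_dict(kept, strict=False)


full_checkpoint = ToyVLModel(image_limit=1).state_dict()
text_only = ToyVLModel(image_limit=0)
text_only.load_filtered(full_checkpoint)  # vision_tower.* weights are ignored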
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 78b8fe4ea6..f92a3e43da 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -21,6 +21,7 @@ from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -125,7 +126,7 @@ class EngineCore: ) self.mm_input_cache_server = MultiModalInputCacheServer( - vllm_config.model_config) + vllm_config.model_config, MULTIMODAL_REGISTRY) # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. This enables us to asynchronously diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 279c9f0007..0532cda03d 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -3,7 +3,7 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Optional -from vllm.multimodal import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs, MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata from vllm.utils import is_list_of @@ -46,10 +46,11 @@ if TYPE_CHECKING: class MultiModalInputCacheClient: """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - def __init__(self, model_config: "ModelConfig") -> None: + def __init__(self, model_config: "ModelConfig", + mm_registry: MultiModalRegistry) -> None: super().__init__() - self.enabled = model_config.enable_mm_input_cache + self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), MultiModalCacheItemMetadata, @@ -85,10 +86,11 @@ class MultiModalInputCacheClient: class MultiModalInputCacheServer: """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" - def __init__(self, model_config: "ModelConfig") -> None: + def __init__(self, model_config: "ModelConfig", + mm_registry: MultiModalRegistry) -> None: super().__init__() - self.enabled = model_config.enable_mm_input_cache + self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), MultiModalKwargs, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6e37ebeb87..b9419142ca 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -51,7 +51,7 @@ class Processor: mm_registry) self.mm_input_cache_client = MultiModalInputCacheClient( - self.model_config) + self.model_config, mm_registry) @property def mm_registry(self): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 08b253dcdb..48ff50fd6b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -129,7 +129,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - self.is_multimodal_model = model_config.is_multimodal_model self.is_pooling_model = model_config.pooler_config is not None self.is_encoder_only_model = False self.is_multimodal_raw_input_supported = ( @@ -149,6 +148,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Multi-modal data support 
self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + model_config) # Sampler self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) @@ -330,7 +331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry, max_model_len=self.max_model_len, max_num_reqs=self.max_num_reqs, - ) if self.is_multimodal_model else None) + ) if self.supports_mm_inputs \ + else None) self.reorder_batch_threshold: Optional[int] = None @@ -1479,14 +1481,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order - if self.is_multimodal_model: + if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) else: mm_embeds = [] - if self.is_multimodal_model and get_pp_group().is_first_rank: + if self.supports_mm_inputs and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. @@ -1817,7 +1819,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: target_hidden_states = hidden_states[token_indices] mm_embeds = None - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_embeds = self._gather_mm_embeddings(scheduler_output, shift_computed_tokens=1) @@ -2209,7 +2211,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): - if self.is_multimodal_model: + if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] model_mm_kwargs = self._dummy_mm_kwargs(num_reqs) @@ -2417,7 +2419,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_budget = self.mm_budget assert mm_budget is not None diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 81252f9b60..442c0ea068 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -157,7 +157,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cache_config.cache_dtype] self._hidden_states_dtype = self.dtype - self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -193,6 +192,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + model_config) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." 
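With the GPU and TPU runners keyed off supports_mm_inputs rather than is_multimodal_model, a multimodal checkpoint whose limits are all 0 now skips encoder profiling, the encoder cache budget and the embedding-merge path entirely. A sketch of how this would be exercised through the Python API (model name and limits are illustrative, and a suitable GPU is assumed):

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 0, "video": 0},  # text-only mode
)
out = llm.generate("The capital of France is")
print(out[0].outputs[0].text)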
@@ -293,7 +294,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry, max_model_len=self.max_model_len, max_num_reqs=self.max_num_reqs, - ) if self.is_multimodal_model else None) + ) if self.supports_mm_inputs else None) if not self.use_spmd: self.sample_from_logits_func = torch.compile( @@ -947,7 +948,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_model_inputs(self, input_ids: torch.Tensor, mm_embeds: list[torch.Tensor]): - if self.is_multimodal_model: + if self.supports_mm_inputs: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. @@ -979,7 +980,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.kv_connector_no_forward(scheduler_output, self.vllm_config) - if self.is_multimodal_model: + if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) @@ -1230,7 +1231,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): @torch.no_grad() def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None: - if self.is_multimodal_model: + if self.supports_mm_inputs: input_ids = None inputs_embeds = torch.zeros((num_tokens, self.hidden_size), dtype=self.dtype, @@ -1271,7 +1272,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): _num_slices_per_kv_cache_update_block, ) - if self.is_multimodal_model: + if self.supports_mm_inputs: torch._dynamo.mark_dynamic(inputs_embeds, 0) else: torch._dynamo.mark_dynamic(input_ids, 0) @@ -1305,7 +1306,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): xm.mark_step() # Captures metadata updates def _precompile_mm_encoder(self) -> None: - if not self.is_multimodal_model: + if not self.supports_mm_inputs: return # Pre-compile MM encoder for all supported data modalities. @@ -1527,7 +1528,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens: int, ) -> None: # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_budget = self.mm_budget assert mm_budget is not None @@ -1684,7 +1685,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): get_kv_transfer_group().set_host_xfer_buffer_ops(copy_kv_blocks) def reset_dynamo_cache(self): - if self.is_multimodal_model: + + # NOTE: We check `is_multimodal_model` instead of `supports_mm_inputs` + # since the compiled model object of the language backbone of a + # multimodal model needs to be extracted via `get_language_model`. 
+ if self.model_config.is_multimodal_model: compiled_model = self.model.get_language_model().model else: compiled_model = self.model.model From 23472ff51cdf25c2f9c9bf9afa50a8d3cc6cc1d8 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 8 Aug 2025 23:04:19 -0700 Subject: [PATCH 109/932] [Doc] Add usage of implicit text-only mode (#22561) Signed-off-by: Roger Wang Co-authored-by: Flora Feng <4florafeng@gmail.com> --- docs/models/supported_models.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b79650444a..afabfccb55 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -583,6 +583,9 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp **This is no longer required if you are using vLLM V1.** +!!! tip + For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. + !!! note vLLM currently only supports adding LoRA to the language backbone of multimodal models. From 8a0ffd6285f6a0d8137d9363f448cef78ce97712 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 08:05:32 +0200 Subject: [PATCH 110/932] Remove mamba_ssm from vLLM requirements; install inside test container using `--no-build-isolation` (#22541) Signed-off-by: Thomas Parnell --- .buildkite/test-pipeline.yaml | 8 ++++---- docs/contributing/ci/update_pytorch_version.md | 13 ------------- requirements/test.in | 5 ++--- requirements/test.txt | 13 +------------ tests/models/language/generation/test_hybrid.py | 16 +++++++++------- tests/models/registry.py | 16 ++++++++++------ 6 files changed, 26 insertions(+), 45 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e139c6b305..221888edb3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -535,8 +535,6 @@ steps: - vllm/ - tests/models/language commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip freeze | grep -E 'torch' - pytest -v -s models/language -m core_model @@ -547,8 +545,10 @@ steps: - vllm/ - tests/models/language/generation commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m hybrid_model - label: Language Models Test (Extended Generation) # 1hr20min diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 3a6026d450..7ef22d6f8c 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -131,19 +131,6 @@ MAX_JOBS=16 uv pip install --system \ --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" ``` -### Mamba - -```bash -uv pip install --system \ - --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" -``` - -### causal-conv1d - -```bash -uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' -``` - ## Update all the different vLLM platforms Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable diff --git a/requirements/test.in b/requirements/test.in index 1e0cab80a2..ca22fd1551 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -10,7 +10,7 @@ pytest-timeout # testing utils backoff # required for phi4mm test blobfile # required for kimi-vl test -einops # required for MPT, qwen-vl and Mamba +einops # required for MPT, qwen-vl httpx librosa # required for audio tests vector_quantize_pytorch # required for minicpmo_26 test @@ -26,7 +26,6 @@ torch==2.7.1 torchaudio==2.7.1 torchvision==0.22.1 transformers_stream_generator # required for qwen-vl test -mamba_ssm==2.2.5 # required for plamo2 test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.2 # required for voxtral test num2words # required for smolvlm test @@ -53,4 +52,4 @@ runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 -terratorch==1.1rc2 # required for PrithviMAE test \ No newline at end of file +terratorch==1.1rc2 # required for PrithviMAE test diff --git a/requirements/test.txt b/requirements/test.txt index 324f8153b2..377eeb58c4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -178,7 +178,6 @@ einops==0.8.1 # via # -r requirements/test.in # encodec - # mamba-ssm # terratorch # torchgeo # vector-quantize-pytorch @@ -417,8 +416,6 @@ lxml==5.3.0 # sacrebleu mako==1.3.10 # via alembic -mamba-ssm==2.2.5 - # via -r requirements/test.in markdown==3.8.2 # via mlflow markdown-it-py==3.0.0 @@ -475,8 +472,6 @@ networkx==3.2.1 # via # scikit-image # torch -ninja==1.11.1.3 - # via mamba-ssm nltk==3.9.1 # via rouge-score num2words==0.5.14 @@ -629,7 +624,6 @@ packaging==24.2 # lazy-loader # lightning # lightning-utilities - # mamba-ssm # matplotlib # mlflow-skinny # peft @@ -973,7 +967,6 @@ sentencepiece==0.2.0 setuptools==77.0.3 # via # lightning-utilities - # mamba-ssm # pytablewriter # torch # triton @@ -1085,7 +1078,6 @@ torch==2.7.1+cu128 # lightly # lightning # lm-eval - # mamba-ssm # mteb # open-clip-torch # peft @@ -1152,16 +1144,13 @@ transformers==4.55.0 # -r requirements/test.in # genai-perf # lm-eval - # mamba-ssm # peft # sentence-transformers # transformers-stream-generator transformers-stream-generator==0.0.5 # 
via -r requirements/test.in triton==3.3.1 - # via - # mamba-ssm - # torch + # via torch tritonclient==2.51.0 # via # -r requirements/test.in diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 67ba2f2559..8c3e1f5c2b 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -25,10 +25,8 @@ SSM_MODELS = [ HYBRID_MODELS = [ "ai21labs/Jamba-tiny-dev", - # NOTE: Running Plamo2 in transformers implementation requires to install - # causal-conv1d package, which is not listed as a test dependency as it's - # not compatible with pip-compile. - "pfnet/plamo-2-1b", + # skipping until vLLM implementation issues are resolved + # "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", "ibm-ai-platform/Bamba-9B-v1", @@ -83,12 +81,16 @@ def test_models( try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") + hf_version_check = model_info.check_transformers_version( + on_fail="return") except ValueError: - pass + hf_version_check = None + + if hf_version_check is not None: + print(f"Skipping transformers comparison because: {hf_version_check}") with hf_runner(model) as hf_model: - if model not in HF_UNSUPPORTED_MODELS: + if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) else: diff --git a/tests/models/registry.py b/tests/models/registry.py index b1952ce9c2..2bb06b7d19 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -79,17 +79,17 @@ class _HfExamplesInfo: def check_transformers_version( self, *, - on_fail: Literal["error", "skip"], + on_fail: Literal["error", "skip", "return"], check_min_version: bool = True, check_max_version: bool = True, - ) -> None: + ) -> Optional[str]: """ If the installed transformers version does not meet the requirements, perform the given action. """ if (self.min_transformers_version is None and self.max_transformers_version is None): - return + return None current_version = TRANSFORMERS_VERSION cur_base_version = Version(current_version).base_version @@ -105,16 +105,18 @@ class _HfExamplesInfo: and Version(cur_base_version) > Version(max_version)): msg += f"<={max_version}` is required to run this model." 
else: - return + return None if self.transformers_version_reason: msg += f" Reason: {self.transformers_version_reason}" if on_fail == "error": raise RuntimeError(msg) - else: + elif on_fail == "skip": pytest.skip(msg) + return msg + def check_available_online( self, *, @@ -148,7 +150,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), - "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B", + "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", + min_transformers_version="4.55.1", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), @@ -223,6 +226,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", + min_transformers_version="4.55.1", extras={ "tiny": "ai21labs/Jamba-tiny-dev", "random": "ai21labs/Jamba-tiny-random", # noqa: E501 From 3157aebb63a2e121da6de943754dc95dffd14caa Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sat, 9 Aug 2025 02:07:48 -0400 Subject: [PATCH 111/932] [Log] Add Warning for Deprecation of DeepGEMM old version (#22194) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 0edfb01cde..174287b44b 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -14,6 +14,7 @@ from typing import Any, Callable, NoReturn import torch import vllm.envs as envs +from vllm.logger import logger from vllm.platforms import current_platform from vllm.utils import cdiv, has_deep_gemm @@ -57,6 +58,14 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: if hasattr(module, new): return getattr(module, new) if hasattr(module, old): + # TODO(wentao): deprecate old symbol in the future. + logger.warning_once( + "Found legacy DeepGEMM symbol `%s`. Please upgrade the `deep_gemm` " + "package so that `%s` is available. 
Support for the legacy symbol " + "will be removed in a future vLLM release.", + old, + new, + ) return getattr(module, old) return None From 6ade99eafa373f5c88eb6b8956daa4c217aa7cda Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 08:08:48 +0200 Subject: [PATCH 112/932] [V1] [Hybrid] Support Minimax-Text-01 in V1 (#22151) Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/lightning_attn.py | 2 +- .../layers/mamba/mamba_utils.py | 11 + vllm/model_executor/models/minimax_text_01.py | 192 ++++++++++++++---- vllm/v1/attention/backends/linear_attn.py | 67 ++++++ vllm/v1/attention/backends/mamba_selectors.py | 4 +- 5 files changed, 234 insertions(+), 42 deletions(-) create mode 100644 vllm/v1/attention/backends/linear_attn.py diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 978086d190..8ffc700ca5 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -532,7 +532,7 @@ def _linear_attn_decode_kernel( pid_d = tl.program_id(2) # dimension block index # Load slot index for the current batch - slot_id = tl.load(slot_idx + pid_b) + slot_id = tl.load(slot_idx + pid_b).to(tl.int64) # Skip if slot_id is -1 (padding) if slot_id == -1: diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 42c815b08f..ad14017912 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -5,6 +5,17 @@ from vllm.distributed import divide class MambaStateShapeCalculator: + @classmethod + def linear_attention_state_shape( + cls, + num_heads: int, + tp_size: int, + head_dim: int, + ) -> tuple[tuple[int, int, int], ...]: + + state_shape = (num_heads // tp_size, head_dim, head_dim) + return (state_shape, ) + @classmethod def mamba1_state_shape( cls, diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index f2773af490..1f9f7f60ca 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -14,8 +14,9 @@ from einops import rearrange from torch import nn from transformers.configuration_utils import PretrainedConfig +from vllm import envs from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, @@ -33,6 +34,9 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -41,8 +45,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata -from .interfaces import HasInnerState, IsHybrid, 
SupportsV0Only +from .interfaces import HasInnerState, IsHybrid from .minimax_cache import MinimaxCacheManager, MinimaxCacheParams from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers @@ -327,7 +332,17 @@ class MiniMaxText01LinearKernel: return rearrange(output.squeeze(0), "h n d -> n (h d)") -class MiniMaxText01LinearAttention(nn.Module): +class MiniMaxText01LinearAttention(nn.Module, MambaBase): + + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=self.num_heads, + tp_size=self.tp_size, + head_dim=self.head_dim) def __init__( self, @@ -359,6 +374,7 @@ class MiniMaxText01LinearAttention(nn.Module): self.tp_heads = self.total_num_heads // self.tp_size self.qkv_size = self.num_heads * self.head_dim self.tp_hidden = self.head_dim * self.tp_heads + self.prefix = prefix self.qkv_proj = ColumnParallelLinear( hidden_size, @@ -397,6 +413,12 @@ class MiniMaxText01LinearAttention(nn.Module): self.tp_heads:(self.tp_rank + 1) * self.tp_heads].contiguous() + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + @staticmethod def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: @@ -434,13 +456,14 @@ class MiniMaxText01LinearAttention(nn.Module): break if _prefill_idx >= len(state_indices_tensor): break - _start = attn_metadata.query_start_loc[_prefill_idx] - _end = attn_metadata.query_start_loc[_prefill_idx + 1] - slot_id = state_indices_tensor[_prefill_idx] + # prefills are packed at end of batch in V1 + offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0 + _start = attn_metadata.query_start_loc[offset + _prefill_idx] + _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] + slot_id = state_indices_tensor[offset + _prefill_idx] qs = q[_start:_end].transpose(0, 1).contiguous() ks = k[_start:_end].transpose(0, 1).contiguous() vs = v[_start:_end].transpose(0, 1).contiguous() - slot_id = state_indices_tensor[_prefill_idx] slice_layer_cache = kv_cache[slot_id, ...] 
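            # Illustrative example of the V1 layout assumed above (numbers are
            # made up): with 3 single-token decodes followed by prefills of
            # lengths 4 and 6,
            #     query_start_loc = [0, 1, 2, 3, 7, 13]
            #     num_decodes = num_decode_tokens = 3, num_prefills = 2
            # so prefill i spans query_start_loc[offset + i] to
            # query_start_loc[offset + i + 1], e.g. tokens 3..7 for the first
            # prefill. On V0 the offset is 0 because prefills come first.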
out_slice = MiniMaxText01LinearKernel.jit_linear_forward_prefix( @@ -453,9 +476,13 @@ class MiniMaxText01LinearAttention(nn.Module): layer_idx=self.layer_idx) hidden.append(out_slice.contiguous()) if attn_metadata.num_decode_tokens > 0: - hidden.append( - self._decode_infer(q, k, v, kv_cache, state_indices_tensor, - attn_metadata)) + hidden_decode = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + if envs.VLLM_USE_V1: + hidden.insert(0, hidden_decode) + else: + hidden.append(hidden_decode) if not hidden: return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) @@ -465,11 +492,17 @@ class MiniMaxText01LinearAttention(nn.Module): def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata): - q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - slot_id = state_indices_tensor[getattr(attn_metadata, "num_prefills", 0 - ):] + if not envs.VLLM_USE_V1: + q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + num_prefills = getattr(attn_metadata, "num_prefills", 0) + slot_id = state_indices_tensor[num_prefills:] + else: + q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + slot_id = state_indices_tensor[:attn_metadata.num_decodes] hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope, slot_id, 32) return hidden @@ -483,17 +516,49 @@ class MiniMaxText01LinearAttention(nn.Module): q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - kv_cache = kv_caches.minimax_cache - state_indices_tensor = kv_caches.state_indices_tensor + if envs.VLLM_USE_V1: + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, LinearAttentionMetadata) + kv_cache = self.kv_cache[forward_context.virtual_engine][0] + state_indices_tensor = attn_metadata.state_indices_tensor + + num_prefills = getattr(attn_metadata, "num_prefills", 0) + if num_prefills > 0: + num_decode_tokens = getattr(attn_metadata, + "num_decode_tokens", 0) + for prefill_idx in range(num_prefills): + q_start = attn_metadata.query_start_loc[ + num_decode_tokens + prefill_idx] + q_end = attn_metadata.query_start_loc[num_decode_tokens + + prefill_idx + + 1] + query_len = q_end - q_start + context_len = attn_metadata.seq_lens[ + num_decode_tokens + prefill_idx] - query_len + if context_len == 0: + block_to_clear = state_indices_tensor[ + num_decode_tokens + prefill_idx] + kv_cache[block_to_clear, ...] 
= 0 + else: + kv_cache = kv_caches.minimax_cache + state_indices_tensor = kv_caches.state_indices_tensor decode_only = getattr(attn_metadata, "num_prefills", 0) == 0 - if not decode_only: - hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, - state_indices_tensor, - attn_metadata) + if attn_metadata is None: + hidden = torch.empty((q.shape[0], q.shape[1] * q.shape[2]), + device=q.device, + dtype=q.dtype) else: - hidden = self._decode_infer(q, k, v, kv_cache, - state_indices_tensor, attn_metadata) + if not decode_only: + hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + else: + hidden = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) hidden = self.norm._forward(hidden) gate, _ = self.output_gate(hidden_states) @@ -541,6 +606,7 @@ class MiniMaxText01Attention(nn.Module): self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.sliding_window = sliding_window + self.prefix = prefix self.qkv_proj = QKVParallelLinear( hidden_size, @@ -575,7 +641,12 @@ class MiniMaxText01Attention(nn.Module): attn_metadata = forward_context.attn_metadata qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = attn_metadata.rotary_emb(positions, q, k) + if envs.VLLM_USE_V1: + if attn_metadata is not None: + q, k = attn_metadata[f"{self.prefix}.attn"].rotary_emb( + positions, q, k) + else: + q, k = attn_metadata.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output @@ -595,6 +666,7 @@ class MiniMaxText01DecoderLayer(nn.Module): ) -> None: self._ilayer = layer_id self._irank = get_tensor_model_parallel_rank() + self.prefix = prefix super().__init__() self.hidden_size = config.hidden_size @@ -876,8 +948,9 @@ class MiniMaxText01Model(nn.Module): self._dtype = _dummy.dtype del _dummy - self.minimax_cache = MinimaxCacheManager(dtype=torch.float32, - cache_shape=self.cache_shape) + if not envs.VLLM_USE_V1: + self.minimax_cache = MinimaxCacheManager( + dtype=torch.float32, cache_shape=self.cache_shape) rope_theta = getattr(config, "rope_theta", 10000) head_dim = getattr(config, "head_dim", None) @@ -944,23 +1017,27 @@ class MiniMaxText01Model(nn.Module): **kwargs) -> Union[torch.Tensor, IntermediateTensors]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - if attn_metadata is None: + if not envs.VLLM_USE_V1 and attn_metadata is None: return None if "request_ids_to_seq_ids" not in kwargs: kwargs["request_ids_to_seq_ids"] = {} if "finished_requests_ids" not in kwargs: kwargs["finished_requests_ids"] = [] - ( - minimax_cache_tensors, - state_indices_tensor, - ) = self.minimax_cache.current_run_tensors(**kwargs) - if getattr(attn_metadata, "num_prefills", 0) > 0: - self._clear_prefill_cache(attn_metadata, minimax_cache_tensors, - **kwargs) + if not envs.VLLM_USE_V1: + ( + minimax_cache_tensors, + state_indices_tensor, + ) = self.minimax_cache.current_run_tensors(**kwargs) + if getattr(attn_metadata, "num_prefills", 0) > 0: + self._clear_prefill_cache(attn_metadata, minimax_cache_tensors, + **kwargs) + + minimax_cache_params = MinimaxCacheParams(minimax_cache_tensors, + state_indices_tensor) + else: + minimax_cache_params = None - minimax_cache_params = MinimaxCacheParams(minimax_cache_tensors, - state_indices_tensor) if get_pp_group().is_first_rank: if inputs_embeds is None: hidden_states = self.embed_scale * self.embed_tokens(input_ids) @@ -973,11 +1050,22 @@ class 
MiniMaxText01Model(nn.Module): residual = intermediate_tensors["residual"] minimax_cache_index = 0 - attn_metadata.rotary_emb = self.rotary_emb + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + if attn_metadata is not None: + # TODO (tdoublep): this whole thing with the rotary_emb is + # weird. we shouldn't be passing it via attn_metadata imo. + if envs.VLLM_USE_V1: + if isinstance(layer.self_attn, MiniMaxText01Attention): + attn_metadata[layer.prefix + + ".attn"].rotary_emb = self.rotary_emb + else: + attn_metadata.rotary_emb = self.rotary_emb + _caches = None - if isinstance(layer.self_attn, MiniMaxText01LinearAttention): + if not envs.VLLM_USE_V1 and isinstance( + layer.self_attn, MiniMaxText01LinearAttention): current_state_layer = minimax_cache_index _caches = minimax_cache_params.at_layer_idx( current_state_layer) @@ -1002,8 +1090,7 @@ class MiniMaxText01Model(nn.Module): return hidden_states -class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, - SupportsV0Only): +class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -1321,3 +1408,28 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, load_basic_weight(name, loaded_weight, self) return loaded_params + + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + use_v1: bool = True, + ) -> tuple[tuple[int, ...], ...]: + """Calculate shape for MiniMaxText01LinearAttention cache. + + Args: + vllm_config: vLLM config + use_v1: Get shapes for V1 (or V0) + + Returns: + Tuple containing: + - state_shape: Shape of the cache + """ + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=hf_config.num_attention_heads, + tp_size=parallel_config.tensor_parallel_size, + head_dim=hf_config.head_dim, + ) diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py new file mode 100644 index 0000000000..f08b6d7f17 --- /dev/null +++ b/vllm/v1/attention/backends/linear_attn.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import ClassVar + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + + +class LinearAttentionBackend(AttentionBackend): + + @staticmethod + def get_builder_cls() -> type["LinearAttentionMetadataBuilder"]: + return LinearAttentionMetadataBuilder + + +@dataclass +class LinearAttentionMetadata: + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int + query_start_loc: torch.Tensor + seq_lens: torch.Tensor + + state_indices_tensor: torch.Tensor # shape: [batch,] + + +class LinearAttentionMetadataBuilder( + AttentionMetadataBuilder[LinearAttentionMetadata]): + + reorder_batch_threshold: ClassVar[int] = 1 + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + + def build(self, + common_prefix_len: int, + common_attn_metadata: 
CommonAttentionMetadata, + fast_build: bool = False) -> LinearAttentionMetadata: + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=1)) + + attn_metadata = LinearAttentionMetadata( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + query_start_loc=query_start_loc, + seq_lens=seq_lens, + state_indices_tensor=state_indices_tensor, + ) + return attn_metadata diff --git a/vllm/v1/attention/backends/mamba_selectors.py b/vllm/v1/attention/backends/mamba_selectors.py index f56f2fb7bf..852e0dfe1b 100644 --- a/vllm/v1/attention/backends/mamba_selectors.py +++ b/vllm/v1/attention/backends/mamba_selectors.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend @@ -8,9 +9,10 @@ from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend def get_mamba_attn_backend(mamba_type: str) -> type[AttentionBackend]: if mamba_type == "mamba1": return Mamba1AttentionBackend - if mamba_type == "mamba2": return Mamba2AttentionBackend + if mamba_type == "linear_attention": + return LinearAttentionBackend raise NotImplementedError(f"Mamba Attention type {mamba_type} is not " "supported yet.") From 7ad7adb67f1350b6e9f7cfdd7aacf38eed093bb1 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Sat, 9 Aug 2025 09:09:51 +0300 Subject: [PATCH 113/932] v1: Pass KVConnectorOutput to scheduler-side (#22157) Signed-off-by: Or Ozeri --- .../distributed/kv_transfer/kv_connector/v1/base.py | 13 +++++++++++++ .../kv_transfer/kv_connector/v1/multi_connector.py | 5 +++++ vllm/v1/core/sched/scheduler.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 7a2ccb5865..b721043978 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -12,6 +12,8 @@ The class provides the following primitives: times for a given request and should be side-effect free. update_state_after_alloc() - update KVConnector state after temporary buffer alloc by the CacheManager. + update_connector_output() - update KVConnector state after + output is received from worker-side connectors. request_finished() - called when a request is finished, with the computed kv cache blocks for the request. Returns whether KV cache should be freed now or will be @@ -38,6 +40,7 @@ import torch from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -283,6 +286,16 @@ class KVConnectorBase_V1(ABC): """ pass + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. 
+ """ + return + def request_finished( self, request: "Request", diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 62a4980bff..7d67c76e2f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -14,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.logger import init_logger from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -177,6 +178,10 @@ class MultiConnector(KVConnectorBase_V1): self._extra_async_saves = {} return metadata + def update_connector_output(self, connector_output: KVConnectorOutput): + for c in self._connectors: + c.update_connector_output(connector_output) + def request_finished( self, request: "Request", diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 430085d9c9..85fc1a4a01 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1150,6 +1150,10 @@ class Scheduler(SchedulerInterface): # if finished_recving: add to state so we can scheduler the request during the next step. """ + + assert self.connector is not None + self.connector.update_connector_output(kv_connector_output) + # KV Connector:: update recv and send status from last step. for req_id in (kv_connector_output.finished_recving or ()): logger.debug("Finished recving KV transfer for request %s", req_id) From 65552b476b1c475ef433995d2699bb27428693b3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 9 Aug 2025 14:10:51 +0800 Subject: [PATCH 114/932] [Misc] Use config definitions from Transformers library (#21913) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aimv2.py | 22 +++++++++---------- vllm/model_executor/models/commandr.py | 8 +++---- vllm/model_executor/models/dbrx.py | 14 ++++++------ vllm/model_executor/models/deepseek_v2.py | 15 ++++++++----- vllm/model_executor/models/dots1.py | 8 +++---- vllm/model_executor/models/exaone4.py | 6 ++--- vllm/model_executor/models/glm4_moe.py | 10 ++++----- vllm/model_executor/models/minimax_text_01.py | 6 ++--- vllm/model_executor/models/olmoe.py | 4 ++-- vllm/model_executor/models/qwen2_moe.py | 6 ++--- vllm/model_executor/models/qwen3_moe.py | 6 ++--- 11 files changed, 54 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index d2307bb464..b13d863ebb 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -8,7 +8,6 @@ from typing import Optional import torch import torch.nn as nn -from transformers import PretrainedConfig from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size @@ -21,12 +20,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.transformers_utils.configs.ovis import AIMv2Config class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): 
super().__init__() hidden_features = config.intermediate_size in_features = config.hidden_size @@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2PatchEmbed(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: AIMv2Config): super().__init__() self.proj = nn.Conv2d( config.num_channels, @@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module): class AIMv2ViTPreprocessor(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: AIMv2Config): super().__init__() num_patches = (config.image_size // config.patch_size)**2 @@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module): class AIMv2Attention(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module): class AIMv2Block(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): super().__init__() self.attn = AIMv2Attention(config, quant_config=quant_config, @@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: AIMv2Config, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, @@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module): class AIMv2Model(torch.nn.Module): def __init__(self, - config: PretrainedConfig, + config: AIMv2Config, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index c4f6144ed9..69281abf73 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -27,7 +27,7 @@ from typing import Optional, Union import torch from torch import nn -from transformers import CohereConfig +from transformers import Cohere2Config, CohereConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -89,7 +89,7 @@ class CohereMLP(nn.Module): def __init__( self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -124,7 +124,7 @@ class CohereAttention(nn.Module): def __init__( self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -242,7 +242,7 @@ class CohereAttention(nn.Module): class CohereDecoderLayer(nn.Module): def __init__(self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 360c7e66bf..e74d90e0b1 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -6,7 +6,7 @@ from typing import Optional, Union import torch import torch.nn as nn -from transformers import PretrainedConfig +from transformers import DbrxConfig from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig @@ -39,7 +39,7 @@ class DbrxRouter(nn.Module): def __init__( self, - config: PretrainedConfig, + 
config: DbrxConfig, params_dtype: Optional[torch.dtype] = None, ): super().__init__() @@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -138,7 +138,7 @@ class DbrxMoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -169,7 +169,7 @@ class DbrxAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -284,7 +284,7 @@ class DbrxBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c2880c33cb..f199da135e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -29,7 +29,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import DeepseekV2Config, DeepseekV3Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -100,7 +100,7 @@ class DeepseekV2MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -221,7 +221,7 @@ class DeepseekV2Attention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], hidden_size: int, num_heads: int, qk_nope_head_dim: int, @@ -373,7 +373,7 @@ class DeepseekV2MLAAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], hidden_size: int, num_heads: int, qk_nope_head_dim: int, @@ -538,7 +538,7 @@ class DeepseekV2DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], prefix: str, model_config: ModelConfig, cache_config: Optional[CacheConfig] = None, @@ -973,7 +973,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): pass -def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, +# Compatibility with +# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py +def get_spec_layer_idx_from_weight_name(config: Union[DeepseekV2Config, + DeepseekV3Config], weight_name: str) -> Optional[int]: if (hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0): diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 9b21a79446..5f410c0ae5 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -29,7 +29,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers 
import Dots1Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -99,7 +99,7 @@ class Dots1MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Dots1Config, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -174,7 +174,7 @@ class Dots1Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - config: PretrainedConfig, + config: Dots1Config, rope_theta: float = 10000, rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, @@ -260,7 +260,7 @@ class Dots1DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Dots1Config, prefix: str, model_config: ModelConfig, cache_config: Optional[CacheConfig] = None, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 3d6ce3e889..ecd942a76a 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -26,7 +26,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import Exaone4Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Exaone4Config, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Exaone4Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 0053e4e6ff..624eef6cf1 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -28,7 +28,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers.models.glm4_moe import Glm4MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -100,7 +100,7 @@ class Glm4MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -198,7 +198,7 @@ class Glm4MoeAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -297,7 +297,7 @@ class Glm4MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -681,7 +681,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): return self.model.get_expert_mapping() -def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, +def get_spec_layer_idx_from_weight_name(config: Glm4MoeConfig, weight_name: str) -> Optional[int]: if hasattr(config, "num_nextn_predict_layers") and (config.num_nextn_predict_layers diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 1f9f7f60ca..3d14a6ad5c 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -12,7 +12,7 @@ import torch.distributed import torch.nn.functional as F from einops 
import rearrange from torch import nn -from transformers.configuration_utils import PretrainedConfig +from transformers import MiniMaxConfig from vllm import envs from vllm.attention import Attention, AttentionMetadata @@ -656,7 +656,7 @@ class MiniMaxText01DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MiniMaxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, expert_num: int = 1, @@ -860,7 +860,7 @@ class MiniMaxText01Model(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MiniMaxConfig, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, scheduler_config=None, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 7552f64c42..a47c3bd416 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -19,7 +19,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import OlmoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -205,7 +205,7 @@ class OlmoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: OlmoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index b061e2f69a..5c4ad34246 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -30,7 +30,7 @@ from typing import Any, Optional, Union import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig +from transformers import Qwen2MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -98,7 +98,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen2MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -256,7 +256,7 @@ class Qwen2MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen2MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index b2397c115d..3d1e72299b 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -28,7 +28,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import Qwen3MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -101,7 +101,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen3MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -278,7 +278,7 @@ class Qwen3MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen3MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", From 10a02535d4252353880486f6fdf91e5ce7507977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> 
Date: Sat, 9 Aug 2025 08:12:12 +0200 Subject: [PATCH 115/932] Fix loading of quantized BigCode models (#22463) Signed-off-by: Eldar Kurtic --- vllm/model_executor/models/gpt_bigcode.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 661a67bdc0..036ded530f 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -45,7 +45,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTBigCodeAttention(nn.Module): @@ -83,6 +84,7 @@ class GPTBigCodeAttention(nn.Module): total_num_kv_heads, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_attn", ) self.c_proj = RowParallelLinear( @@ -90,6 +92,7 @@ class GPTBigCodeAttention(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.attn = Attention(self.num_heads, self.head_dim, @@ -123,6 +126,7 @@ class GPTBigMLP(nn.Module): intermediate_size: int, config: GPTBigCodeConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -131,12 +135,14 @@ class GPTBigMLP(nn.Module): intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( intermediate_size, hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.activation_function) @@ -167,7 +173,10 @@ class GPTBigCodeBlock(nn.Module): quant_config, prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPTBigMLP(inner_dim, config, quant_config) + self.mlp = GPTBigMLP(inner_dim, + config, + quant_config, + prefix=f"{prefix}.mlp") def forward( self, @@ -260,7 +269,7 @@ class GPTBigCodeModel(nn.Module): weight_loader = getattr(param, "weight_loader", default_weight_loader) # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method - if "c_attn.input_scale" in name or "c_attn.weight_scale" in name: + if "c_attn.input_scale" in name: weight_loader(param, loaded_weight, 'q') weight_loader(param, loaded_weight, 'k') weight_loader(param, loaded_weight, 'v') @@ -284,7 +293,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.quant_config = quant_config self.transformer = GPTBigCodeModel(vllm_config=vllm_config, - prefix=prefix) + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: From 9a0c5ded5aef022d2cfd1a263cd1fecdeb6697be Mon Sep 17 00:00:00 2001 From: Kyuyeun Kim <62023335+kyuyeunk@users.noreply.github.com> Date: Fri, 8 Aug 2025 23:12:54 -0700 Subject: [PATCH 116/932] [TPU] Add support for online w8a8 quantization (#22425) Signed-off-by: Kyuyeun Kim --- .../hardware_ci/run-tpu-v1-test-part2.sh | 2 + tests/v1/tpu/test_tpu_int8.py | 73 +++++++++++++++++++ .../layers/quantization/tpu_int8.py | 10 ++- 3 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 tests/v1/tpu/test_tpu_int8.py diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 734a817fd1..10d2e23649 100755 --- 
a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -139,6 +139,8 @@ run_and_track_test 5 "test_spmd_model_weight_loading.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" run_and_track_test 6 "test_kv_cache_update_kernel.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" +run_and_track_test 7 "test_tpu_int8.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py" # After all tests have been attempted, exit with the overall status. if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/tests/v1/tpu/test_tpu_int8.py b/tests/v1/tpu/test_tpu_int8.py new file mode 100644 index 0000000000..991070dc92 --- /dev/null +++ b/tests/v1/tpu/test_tpu_int8.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests whether TPU Int8 computation is enabled correctly. + +Run `pytest tests/quantization/test_tpu_int8.py`. +""" +import pytest + +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.tpu_int8 import ( + TPUInt8LinearMethod) +from vllm.platforms import current_platform + +from ...models.registry import HF_EXAMPLE_MODELS + +MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] + + +@pytest.mark.skipif(not current_platform.is_tpu(), + reason="TPU Int8 is only enabled for TPUs.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize( + "hf_overrides", + [ + # w8a8 dynamic activation + { + 'quantization_config': { + 'quant_method': 'tpu_int8', + 'activation_scheme': 'dynamic' + } + } + ]) +def test_model_tpu_int8(vllm_runner, model: str, dtype: str, max_tokens: int, + hf_overrides: dict, monkeypatch) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") + + activation_scheme = hf_overrides.get('quantization_config', + {}).get('activation_scheme') + quantize_activation = activation_scheme == 'dynamic' + + # Allows using apply_model + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + # Prevent error from re-initializing cache + monkeypatch.setenv("VLLM_XLA_CACHE_PATH", "") + + prompts = [ + "A robot may not injure a human being", + "It is only with the heart that one can see rightly;", + "The greatest glory in living lies not in never falling,", + ] + answers = [ + "or, being injured, not kill, except in", + "without the heart, one can only see wrongly.", + "but in rising every time we fall. 
- Nelson" + ] + + with vllm_runner(model, dtype=dtype, hf_overrides=hf_overrides) as vllm: + + def check_model(model): + for name, module in model.named_modules(): + if not isinstance(module, LinearBase): + continue + quant_method = module.quant_method + assert isinstance(quant_method, TPUInt8LinearMethod) + assert quant_method.quantize_activation == quantize_activation + + vllm.apply_model(check_model) + outputs = vllm.generate_greedy(prompts, max_tokens) + for (_, output), answer in zip(outputs, answers): + assert answer in output diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 83c8a98eac..38de4b54fb 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import ModelWeightParameter -ACTIVATION_SCHEMES = ["none"] +ACTIVATION_SCHEMES = ["none", "dynamic"] class Int8TpuConfig(QuantizationConfig): @@ -61,6 +61,9 @@ class TPUInt8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Int8TpuConfig): self.quant_config = quant_config + self.quantize_activation = False + if self.quant_config.activation_scheme == 'dynamic': + self.quantize_activation = True def create_weights(self, layer: Module, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, @@ -107,7 +110,7 @@ class TPUInt8LinearMethod(LinearMethodBase): x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: try: - import torch_xla.experimental.xla_quantized_matmul # noqa: F401 + import torch_xla.experimental.custom_kernel # noqa: F401 except ImportError as err: raise ImportError( "Please install torch_xla by following the instructions at " @@ -115,7 +118,8 @@ class TPUInt8LinearMethod(LinearMethodBase): "to run vLLM on TPU.") from err weight = layer.weight scale = layer.scale - out = torch.ops.xla.quantized_matmul(x, weight, scale) + out = torch.ops.xla.quantized_matmul_int8( + x, weight, scale, quantize_activation=self.quantize_activation) if bias is not None: out = out + bias return out From b7c0942b65380ab8c53ecf2657121e1c21150672 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Sat, 9 Aug 2025 01:15:06 -0500 Subject: [PATCH 117/932] [ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097) Signed-off-by: charlifu --- csrc/rocm/attention.cu | 179 +++++++++++++++++------------------ csrc/rocm/ops.h | 4 +- csrc/rocm/torch_bindings.cpp | 4 +- 3 files changed, 91 insertions(+), 96 deletions(-) diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 65cb1c1d14..e3a0e15f53 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -270,7 +270,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -304,12 +304,12 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const auto max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; + const int seq_len = seq_lens[seq_idx]; const int 
partition_start_token_idx = partition_idx * T_PAR_SIZE; // partition_size; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -361,8 +361,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // output layout from QKmfma : QH16xT4x4 16 qheads across 16 lanes, 16 tokens // across 4 rows x 4 tokens per lane - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -373,9 +373,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -476,9 +476,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // tokens const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -554,7 +554,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( if constexpr (ALIBI_ENABLED) { for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; - const int alibi_offset = local_token_idx - context_len + 1; + const int alibi_offset = local_token_idx - seq_len + 1; for (int i = 0; i < 4; i++) { d_out[token_depth][i] += alibi_slope * (alibi_offset + i); } @@ -568,9 +568,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 4; i++) { - const float tmp = (local_token_idx + i < context_len) - ? d_out[token_depth][i] - : -FLT_MAX; + const float tmp = + (local_token_idx + i < seq_len) ? d_out[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -582,7 +581,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 4; i++) { - const float tmp = (local_token_idx + i < context_len) + const float tmp = (local_token_idx + i < seq_len) ? 
__expf(d_out[token_depth][i] - qk_max) : 0.0f; d_out[token_depth][i] = tmp; @@ -780,7 +779,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -809,10 +808,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const auto partition_size = blockDim.x; const auto max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; + const int seq_len = seq_lens[seq_idx]; const int partition_start_token_idx = partition_idx * partition_size; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } // every 4 lanes fetch 4 different qheads @@ -855,7 +854,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int warp_start_token_idx = partition_start_token_idx + warpid * WARP_SIZE; - if (warp_start_token_idx >= context_len) { // warp out of context + if (warp_start_token_idx >= seq_len) { // warp out of context #pragma unroll for (int h = 0; h < GQA_RATIO4; h++) { shared_qk_max[warpid][h] = -FLT_MAX; @@ -863,8 +862,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } } else { // warp within context - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; // token id within partition @@ -873,9 +872,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int global_token_idx = partition_start_token_idx + local_token_idx; // fetch block number for k - const int block_idx = (global_token_idx < context_len) + const int block_idx = (global_token_idx < seq_len) ? global_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; // fetch k physical block number // int32 physical_block_number leads to overflow when multiplied with @@ -888,7 +887,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int b = 0; b < VBLOCKS; b++) { const int vblock_idx = warp_start_block_idx + b; const int vblock_idx_ctx = - (vblock_idx <= last_ctx_block) ? vblock_idx : last_ctx_block; + (vblock_idx <= last_seq_block) ? vblock_idx : last_seq_block; vphysical_blocks[b] = block_table[vblock_idx_ctx]; } @@ -1057,7 +1056,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int lane4_token_idx = 4 * (global_token_idx >> 2); if constexpr (ALIBI_ENABLED) { - const int alibi_offset = lane4_token_idx - context_len + 1; + const int alibi_offset = lane4_token_idx - seq_len + 1; for (int h = 0; h < QHLOOP; h++) { for (int i = 0; i < 4; i++) { d_out[h][i] += alibi_slope[h] * (alibi_offset + i); @@ -1070,7 +1069,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int h = 0; h < QHLOOP; h++) { qk_max[h] = -FLT_MAX; for (int i = 0; i < 4; i++) { - qk_max[h] = (lane4_token_idx + i < context_len) + qk_max[h] = (lane4_token_idx + i < seq_len) ? 
fmaxf(qk_max[h], d_out[h][i]) : qk_max[h]; } @@ -1101,7 +1100,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int h = 0; h < QHLOOP; h++) { exp_sum[h] = 0.0f; for (int i = 0; i < 4; i++) { - d_out[h][i] = (lane4_token_idx + i < context_len) + d_out[h][i] = (lane4_token_idx + i < seq_len) ? __expf(d_out[h][i] - qk_max[h]) : 0.0f; exp_sum[h] += d_out[h][i]; @@ -1181,7 +1180,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } } - if (warp_start_token_idx >= context_len) { // warp out of context + if (warp_start_token_idx >= seq_len) { // warp out of context for (int qh = 0; qh < QHLOOP; qh++) { for (int vh = 0; vh < VHELOOP; vh++) { vout_shared[qh][vh][laneid][warpid] = {0}; @@ -1279,7 +1278,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -1293,8 +1292,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const auto warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -1581,7 +1580,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -1615,11 +1614,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; // length of a seq + const int seq_len = seq_lens[seq_idx]; // length of a seq const int partition_start_token_idx = partition_idx * T_PAR_SIZE; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -1715,8 +1714,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( } } - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -1727,9 +1726,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? 
kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -1781,9 +1780,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( vblock_depth * BLOCK_SIZE; const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -1836,9 +1835,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + 2 * i < context_len) - ? dout[token_depth][i] - : -FLT_MAX; + const float tmp = + (local_token_idx + 2 * i < seq_len) ? dout[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -1848,7 +1846,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + 2 * i < context_len) + const float tmp = (local_token_idx + 2 * i < seq_len) ? __expf(dout[token_depth][i] - qk_max) : 0.0f; dout[token_depth][i] = tmp; @@ -2019,7 +2017,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2046,7 +2044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -2060,8 +2058,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const int warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -2349,7 +2347,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2382,11 +2380,11 @@ __launch_bounds__(NUM_THREADS, 3) void 
paged_attention_ll4mi_QKV_mfma16_kernel( const int max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; // length of a seq + const int seq_len = seq_lens[seq_idx]; // length of a seq const int partition_start_token_idx = partition_idx * T_PAR_SIZE; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -2482,8 +2480,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( } } - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -2494,9 +2492,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -2548,9 +2546,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE; const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -2604,7 +2602,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { const float tmp = - (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX; + (local_token_idx + i < seq_len) ? dout[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -2614,7 +2612,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + i < context_len) + const float tmp = (local_token_idx + i < seq_len) ? 
__expf(dout[token_depth][i] - qk_max) : 0.0f; dout[token_depth][i] = tmp; @@ -2751,7 +2749,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2778,7 +2776,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -2792,8 +2790,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const int warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -2980,7 +2978,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -3007,7 +3005,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -3031,7 +3029,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { UNREACHABLE_CODE @@ -3046,7 +3044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( GQA_RATIO> \ <<>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ max_ctx_blocks, k_scale_ptr, v_scale_ptr); @@ -3057,18 +3055,17 @@ 
__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( GQA_RATIO> \ <<>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ max_ctx_blocks, k_scale_ptr, v_scale_ptr); -#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ - paged_attention_ll4mi_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ - context_lens_ptr, query_start_loc_ptr, max_num_partitions, \ - fp8_out_scale_ptr); +#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ + paged_attention_ll4mi_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + query_start_loc_ptr, max_num_partitions, fp8_out_scale_ptr); template & query_start_loc, int max_context_len, + torch::Tensor& block_tables, torch::Tensor& seq_lens, + const std::optional& query_start_loc, int max_seq_len, const std::optional& alibi_slopes, torch::Tensor& k_scale, torch::Tensor& v_scale, const std::optional& fp8_out_scale) { int num_seqs = block_tables.size(0); @@ -3109,7 +3106,7 @@ void paged_attention_custom_launcher( KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); - int* context_lens_ptr = context_lens.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); // NOTE: fp8_out_scale is optional. @@ -3119,13 +3116,12 @@ void paged_attention_custom_launcher( : nullptr; OUTT* out_ptr = reinterpret_cast(out.data_ptr()); - const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE); // partition size is fixed at 256 since both mfma4 and mfma16 kernels support // it mfma4 kernel also supports partition size 512 constexpr int PARTITION_SIZE = 256; - const int max_num_partitions = - DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); const int gqa_ratio = num_heads / num_kv_heads; assert(num_heads % num_kv_heads == 0); assert(head_size == HEAD_SIZE); @@ -3234,8 +3230,8 @@ void paged_attention_custom_launcher_navi( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, - const std::optional& query_start_loc, int max_context_len, + torch::Tensor& block_tables, torch::Tensor& seq_lens, + const std::optional& query_start_loc, int max_seq_len, const std::optional& alibi_slopes, torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_seqs = block_tables.size(0); @@ -3263,7 +3259,7 @@ void paged_attention_custom_launcher_navi( KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); - int* context_lens_ptr = context_lens.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = 
reinterpret_cast(v_scale.data_ptr()); @@ -3271,11 +3267,10 @@ void paged_attention_custom_launcher_navi( const auto fp8_out_scale_ptr = nullptr; OUTT* out_ptr = reinterpret_cast(out.data_ptr()); - const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE); constexpr int PARTITION_SIZE = 256; - const int max_num_partitions = - DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); const int gqa_ratio = num_heads / num_kv_heads; assert(num_heads % num_kv_heads == 0); assert(head_size == HEAD_SIZE); @@ -3407,14 +3402,14 @@ void paged_attention_custom_launcher_navi( paged_attention_custom_launcher( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ + num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ + max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ } else { \ paged_attention_custom_launcher_navi< \ T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale); \ + num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ + max_seq_len, alibi_slopes, k_scale, v_scale); \ } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ @@ -3502,9 +3497,9 @@ void paged_attention( int64_t num_kv_heads, double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& context_lens, // [num_seqs] + torch::Tensor& seq_lens, // [num_seqs] const std::optional& query_start_loc, // [num_seqs] - int64_t block_size, int64_t max_context_len, + int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index e538197dbc..34dcc9401a 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -15,8 +15,8 @@ void paged_attention( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, + torch::Tensor& block_tables, torch::Tensor& seq_lens, const std::optional& query_start_loc, int64_t block_size, - int64_t max_context_len, const std::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, const std::optional& fp8_out_scale); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 34575477bc..66bdc448da 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -41,10 +41,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor query, Tensor key_cache," " Tensor value_cache, int num_kv_heads," " float scale, Tensor block_tables," - " Tensor context_lens," + " Tensor seq_lens," " Tensor? query_start_loc," " int block_size," - " int max_context_len," + " int max_seq_len," " Tensor? 
alibi_slopes," " str kv_cache_dtype," " Tensor k_scale, Tensor v_scale," From 7920e9b1c5e168fe6218d2d147bdb9acf6bc993d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 9 Aug 2025 15:03:26 +0800 Subject: [PATCH 118/932] [Bugfix] Fix failing GPT-OSS initialization test (#22557) Signed-off-by: Isotr0py --- tests/models/registry.py | 2 +- tests/models/test_initialization.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 2bb06b7d19..64eeed6555 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -200,7 +200,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", {"1b": "EleutherAI/pythia-1.4b"}), - "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"), + "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index f0aa91566b..f06b34285e 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -68,6 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, if model_arch == "Phi4FlashForCausalLM": # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN") + if model_arch == "GptOssForCausalLM": + # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU + # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when + # L4 supports FA3. 
+ m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") LLM( model_info.default, tokenizer=model_info.tokenizer, From 0edc0cd52b68d293250157226abdf631e52a53a3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 9 Aug 2025 15:03:29 +0800 Subject: [PATCH 119/932] [Bugfix] Fix CI moe kernel failure (#22556) Signed-off-by: Jee Jee Li --- .../moe/test_gpt_oss_triton_kernels.py | 204 ++++++++++++------ 1 file changed, 141 insertions(+), 63 deletions(-) diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 3f9b32ce5a..54f2351bf6 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -5,6 +5,15 @@ from dataclasses import dataclass, fields import pytest import torch import torch.nn.functional as F + +from vllm.utils import has_triton_kernels + +if not has_triton_kernels(): + pytest.skip( + "triton_kernels not found, skipping all related tests", + allow_module_level=True, + ) + import triton_kernels.swiglu from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig from triton_kernels.numerics import InFlexData @@ -65,7 +74,7 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): dtype_dict = { "bf16": torch.bfloat16, "fp8_e4m3": torch.float8_e4m3fn, - "fp8_e5m2": torch.float8_e5m2 + "fp8_e5m2": torch.float8_e5m2, } x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16) @@ -97,12 +106,18 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): x_pad = w1_bottom_pad - w1_tri = F.pad(w1_tri, (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), - mode="constant", - value=0) - w2_tri = F.pad(w2_tri, (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), - mode="constant", - value=0) + w1_tri = F.pad( + w1_tri, + (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), + mode="constant", + value=0, + ) + w2_tri = F.pad( + w2_tri, + (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), + mode="constant", + value=0, + ) w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0), mode="constant", @@ -127,13 +142,19 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts) - w1_scale_tri = convert_layout(wrap_torch_tensor(w1_scale_tri), - w_scale_layout, **w_scale_layout_opts) + w1_scale_tri = convert_layout( + wrap_torch_tensor(w1_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts) - w2_scale_tri = convert_layout(wrap_torch_tensor(w2_scale_tri), - w_scale_layout, **w_scale_layout_opts) + w2_scale_tri = convert_layout( + wrap_torch_tensor(w2_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) pc1 = PrecisionConfig(weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())) @@ -149,8 +170,22 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): w1 = w1.transpose(-1, -2).contiguous() w2 = w2.transpose(-1, -2).contiguous() - return (x, w1, w1_bias, w2, w2_bias, exp_data, x_tri, w1_tri, w2_tri, - exp_data_tri, w1_bias_tri, w2_bias_tri, pc1, pc2) + return ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) @dataclass @@ -184,13 +219,14 @@ def swiglu(x, alpha: float = 1.702, limit: float = 1.0): def oai_moe_forward( - hidden_states: torch.Tensor, # (M, K) - w1: torch.Tensor, # (E, 2N) - w1_bias: torch.Tensor, # (E, 2N, K) - w2: torch.Tensor, # (E, K, N) 
- w2_bias: torch.Tensor, # (E, N) - gating_output: torch.Tensor, # (M, E) - topk: int): + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, 2N) + w1_bias: torch.Tensor, # (E, 2N, K) + w2: torch.Tensor, # (E, K, N) + w2_bias: torch.Tensor, # (E, N) + gating_output: torch.Tensor, # (M, E) + topk: int, +): # model.py 309:330, assuming gating and norm t = hidden_states experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True) @@ -240,10 +276,22 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): N = ModelConfig.intermediate_size // tp topk = ModelConfig.experts_per_token - x, w1, w1_bias, w2, w2_bias, exp_data, \ - x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri,\ - w2_bias_tri, pc1, pc2 = init_compute_data( - M, K, N, E, a_dtype, w_dtype, num_warps=8) + ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8) out_triton_monolithic = triton_kernel_moe_forward( hidden_states=x_tri, @@ -255,33 +303,46 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): w1_bias=w1_bias_tri, w2_bias=w2_bias_tri, w1_precision=pc1, - w2_precision=pc2) + w2_precision=pc2, + ) out_triton_monolithic = out_triton_monolithic[..., :K] - out_ref = oai_moe_forward(hidden_states=x, - w1=w1, - w1_bias=w1_bias, - w2=w2, - w2_bias=w2_bias, - gating_output=exp_data, - topk=topk) + out_ref = oai_moe_forward( + hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk, + ) assert_close(ref=out_ref, tri=out_triton_monolithic, maxtol=0.025, rmstol=0.005) -def batched_moe(a: torch.Tensor, w1, w2, gating_output: torch.Tensor, - topk: int, renormalize: bool, w1_bias: torch.Tensor, - w2_bias: torch.Tensor, w1_precision: PrecisionConfig, - w2_precision: PrecisionConfig) -> torch.Tensor: +def batched_moe( + a: torch.Tensor, + w1, + w2, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + w1_bias: torch.Tensor, + w2_bias: torch.Tensor, + w1_precision: PrecisionConfig, + w2_precision: PrecisionConfig, +) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) fused_experts = FusedMoEModularKernel( - BatchedPrepareAndFinalize(max_num_tokens, - num_dispatchers=1, - num_local_experts=w1.shape[0], - rank=0), + BatchedPrepareAndFinalize( + max_num_tokens, + num_dispatchers=1, + num_local_experts=w1.shape[0], + rank=0, + ), BatchedOAITritonExperts( None, max_num_tokens=max_num_tokens, @@ -327,30 +388,46 @@ def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep): N = ModelConfig.intermediate_size topk = ModelConfig.experts_per_token - x, w1, w1_bias, w2, w2_bias, exp_data, \ - x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri, \ - w2_bias_tri, pc1, pc2 = init_compute_data( - M, K, N, E, a_dtype, w_dtype, num_warps=4) + ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=4) - out_tri = batched_moe(a=x_tri, - w1=w1_tri, - w2=w2_tri, - gating_output=exp_data_tri, - topk=topk, - renormalize=True, - w1_bias=w1_bias_tri, - w2_bias=w2_bias_tri, - w1_precision=pc1, - w2_precision=pc2) + out_tri = batched_moe( + a=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + w1_precision=pc1, + w2_precision=pc2, + ) out_tri = out_tri[..., :K] - out_ref = 
oai_moe_forward(hidden_states=x, - w1=w1, - w1_bias=w1_bias, - w2=w2, - w2_bias=w2_bias, - gating_output=exp_data, - topk=topk) + out_ref = oai_moe_forward( + hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk, + ) assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005) @@ -370,6 +447,7 @@ def test_unit_shuffle(): out = triton_kernels.swiglu.swiglu_torch( out, alpha=1.702, - precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0)) + precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0), + ) - assert_close(ref=out_ref, tri=out) \ No newline at end of file + assert_close(ref=out_ref, tri=out) From 2be07a0db115e65009111145e17b034c54ae4a01 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 09:18:18 +0200 Subject: [PATCH 120/932] Update docs for Minimax-Text support (#22562) Signed-off-by: Thomas Parnell --- docs/models/supported_models.md | 4 ++-- docs/usage/v1_guide.md | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index afabfccb55..87dd08e059 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -404,8 +404,8 @@ th { | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ | | `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | | +| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | !!! note diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index d30144e8a8..a9492c8502 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -111,6 +111,10 @@ Models that combine Mamba-2 and Mamba-1 layers with standard attention layers ar `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. +Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). +Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer +attention backend in V1. 
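A minimal offline sketch of the constraints just described, assuming the `enable_prefix_caching` and `enforce_eager` engine arguments and the `FLASHINFER` value of `VLLM_ATTENTION_BACKEND`; the checkpoint name is only an example taken from the supported-models table and would normally also need tensor parallelism:

```python
# Sketch: run a MiniMax-Text hybrid model with prefix caching disabled,
# eager mode enforced, and the FlashInfer attention backend selected.
# The flag names and backend value are assumptions to verify against the docs.
import os

# Select the FlashInfer backend before vLLM is imported.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM, SamplingParams

llm = LLM(
    model="MiniMaxAI/MiniMax-Text-01",  # example checkpoint; large, typically needs tensor parallelism
    enable_prefix_caching=False,        # prefix caching must be disabled for these models
    enforce_eager=True,                 # eager mode is currently required
)

outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```
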
+ #### Encoder-Decoder Models Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`) From a6022e6fbcbdba65e3c0e6dce5c9e3cbc8120e90 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Sat, 9 Aug 2025 15:50:21 +0800 Subject: [PATCH 121/932] GLM-4.5V with new class name at transformers (#22520) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- docs/models/supported_models.md | 4 ++-- tests/models/registry.py | 2 +- vllm/model_executor/models/glm4_moe.py | 8 +++++++- vllm/model_executor/models/registry.py | 2 +- vllm/transformers_utils/config.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 87dd08e059..19186a0635 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -352,6 +352,7 @@ th { | `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | @@ -609,8 +610,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 64eeed6555..09d62413fe 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -396,7 +396,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 - "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", + "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 624eef6cf1..131c042c3c 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -372,7 +372,13 @@ class Glm4MoeDecoderLayer(nn.Module): return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Glm4MoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c746e8ec3f..4aa958ecdc 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -207,7 +207,7 @@ _MULTIMODAL_MODELS = { "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 - "Glm4v_moeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index bce24ef74c..de779f94a4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -254,7 +254,8 @@ def _uses_mrope(config: PretrainedConfig) -> bool: def uses_mrope(config: PretrainedConfig) -> bool: """Detect if the model with this config uses M-ROPE.""" - return _uses_mrope(config) or thinker_uses_mrope(config) + return _uses_mrope(config) or _uses_mrope( + config.get_text_config()) or thinker_uses_mrope(config) def thinker_uses_mrope(config: PretrainedConfig) -> bool: From 1bf5e1f25b92423f5739ea7cbb9266f61af12b0b Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 11:04:42 +0200 Subject: [PATCH 122/932] [CI] [Hybrid] Speed up hybrid models test by removing large models (#22563) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 8c3e1f5c2b..4934da9517 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -20,7 +20,7 @@ pytestmark = pytest.mark.hybrid_model SSM_MODELS = [ "state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev", - 
"mistralai/Mamba-Codestral-7B-v0.1", + "yujiepan/mamba2-codestral-v0.1-tiny-random", ] HYBRID_MODELS = [ @@ -29,8 +29,6 @@ HYBRID_MODELS = [ # "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", - "ibm-ai-platform/Bamba-9B-v1", - "nvidia/Nemotron-H-8B-Base-8K", "ibm-granite/granite-4.0-tiny-preview", "tiiuae/Falcon-H1-0.5B-Base", ] @@ -40,23 +38,18 @@ HF_UNSUPPORTED_MODELS = [ # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test # doesn't compare vLLM output with HF output. # See https://github.com/huggingface/transformers/pull/35943 - "mistralai/Mamba-Codestral-7B-v0.1", - # Note: I'm not seeing the same output from vLLM V0 vs. HF transformers - # for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1 - "nvidia/Nemotron-H-8B-Base-8K", - # NOTE: Currently the test fails due to HF transformers issue fixed in: - # https://github.com/huggingface/transformers/pull/39033 - # We will enable vLLM test for Granite after next HF transformers release. - "ibm-granite/granite-4.0-tiny-preview", + "yujiepan/mamba2-codestral-v0.1-tiny-random", + # transformers 4.55 is still producing garbage for this model + # TODO(tdoublep): follow-up on transformers side + "ibm-granite/granite-4.0-tiny-preview" ] V1_SUPPORTED_MODELS = [ "state-spaces/mamba-130m-hf", "ai21labs/Jamba-tiny-dev", - "mistralai/Mamba-Codestral-7B-v0.1", - "ibm-ai-platform/Bamba-9B-v1", + "yujiepan/mamba2-codestral-v0.1-tiny-random", "Zyphra/Zamba2-1.2B-instruct", - "nvidia/Nemotron-H-8B-Base-8K", + "hmellor/tiny-random-BambaForCausalLM", "ibm-granite/granite-4.0-tiny-preview", "tiiuae/Falcon-H1-0.5B-Base", ] From 56186474f6afef825943fb5c5b1ad288909b6783 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:31:32 +0100 Subject: [PATCH 123/932] [Docs] Reduce noise in docs and `--help` from the JSON tip (#22567) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/cli/README.md | 10 ++++++++++ docs/configuration/engine_args.md | 10 ++++++++++ vllm/engine/arg_utils.py | 23 ++--------------------- vllm/utils/__init__.py | 21 ++++++++++++++++++--- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index a7de6d7192..b512a4f4ba 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -16,6 +16,16 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` + ## serve Start the vLLM OpenAI Compatible API server. diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md index c3c1d5a1c3..e7ca08b557 100644 --- a/docs/configuration/engine_args.md +++ b/docs/configuration/engine_args.md @@ -11,6 +11,16 @@ Engine arguments control the behavior of the vLLM engine. The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. 
Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings. +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` + ## `EngineArgs` --8<-- "docs/argparse/engine_args.md" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c9dc99cad2..4d4ce4c78e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -178,17 +178,8 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name] = {"default": default, "help": help} # Set other kwargs based on the type hints - json_tip = """Should either be a valid JSON string or JSON keys -passed individually. For example, the following sets of arguments are -equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`""" + json_tip = ("Should either be a valid JSON string or JSON keys passed " + "individually.") if dataclass_cls is not None: def parse_dataclass(val: str, cls=dataclass_cls) -> Any: @@ -1831,13 +1822,3 @@ def human_readable_int(value): # Regular plain number. return int(value) - - -# These functions are used by sphinx to build the documentation -def _engine_args_parser(): - return EngineArgs.add_cli_args(FlexibleArgumentParser()) - - -def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), - async_args_only=True) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7a0abf5b59..a4997226ea 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1669,11 +1669,19 @@ class FlexibleArgumentParser(ArgumentParser): """ArgumentParser that allows both underscore and dash in names.""" _deprecated: set[Action] = set() + _json_tip: str = ( + "When passing JSON CLI arguments, the following sets of arguments " + "are equivalent:\n" + ' --json-arg \'{"key1": "value1", "key2": {"key3": "value2"}}\'\n' + " --json-arg.key1 value1 --json-arg.key2.key3 value2\n\n" + "Additionally, list elements can be passed individually using +:\n" + ' --json-arg \'{"key4": ["value3", "value4", "value5"]}\'\n' + " --json-arg.key4+ value3 --json-arg.key4+=\'value4,value5\'\n\n") def __init__(self, *args, **kwargs): - # Set the default 'formatter_class' to SortedHelpFormatter - if 'formatter_class' not in kwargs: - kwargs['formatter_class'] = SortedHelpFormatter + # Set the default "formatter_class" to SortedHelpFormatter + if "formatter_class" not in kwargs: + kwargs["formatter_class"] = SortedHelpFormatter super().__init__(*args, **kwargs) if sys.version_info < (3, 13): @@ -1715,6 +1723,13 @@ class FlexibleArgumentParser(ArgumentParser): self._action_groups.append(group) return group + def format_help(self) -> str: + # Add tip about JSON arguments to the epilog + epilog = self.epilog or "" + if not epilog.startswith(FlexibleArgumentParser._json_tip): + self.epilog = FlexibleArgumentParser._json_tip + epilog + return super().format_help() + def 
parse_args( # type: ignore[override] self, args: list[str] | None = None, From 2d18256e47805bf32d1ae04ba1a8c9fd98261fcf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:33:46 +0100 Subject: [PATCH 124/932] Move `ParallelConfig` from `config/__init__.py` to `config/parallel.py` (#22565) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 2 +- vllm/config/__init__.py | 357 +---------------------------------- vllm/config/compilation.py | 2 +- vllm/config/parallel.py | 375 +++++++++++++++++++++++++++++++++++++ 4 files changed, 379 insertions(+), 357 deletions(-) create mode 100644 vllm/config/parallel.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5bc9442967..0a7f8e8be4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor +/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 69c05b75d3..7efab23f14 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -25,13 +25,13 @@ from pydantic import (ConfigDict, SkipValidation, field_validator, model_validator) from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) +from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -50,20 +50,16 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, LayerBlockType, LazyLoader, common_broadcastable_dtype, - cuda_device_count_stateless, get_cpu_memory, - get_open_port, random_uuid) + get_cpu_memory, random_uuid) # yapf: enable if TYPE_CHECKING: from _typeshed import DataclassInstance - from ray.runtime_env import RuntimeEnv - from ray.util.placement_group import PlacementGroup from transformers.configuration_utils import PretrainedConfig import vllm.model_executor.layers.quantization as me_quant import vllm.model_executor.models as me_models - from vllm.executor.executor_base import ExecutorBase from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -73,10 +69,7 @@ if TYPE_CHECKING: HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any - PlacementGroup = Any - RuntimeEnv = Any PretrainedConfig = Any - ExecutorBase = Any QuantizationConfig = Any QuantizationMethods = Any BaseModelLoader = Any @@ -2043,352 +2036,6 @@ class LoadConfig: self.ignore_patterns = ["original/**/*"] -DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] - - -@config -@dataclass -class 
ParallelConfig: - """Configuration for the distributed execution.""" - - pipeline_parallel_size: int = 1 - """Number of pipeline parallel groups.""" - tensor_parallel_size: int = 1 - """Number of tensor parallel groups.""" - data_parallel_size: int = 1 - """Number of data parallel groups. MoE layers will be sharded according to - the product of the tensor parallel size and data parallel size.""" - data_parallel_size_local: int = 1 - """Number of local data parallel groups.""" - data_parallel_rank: int = 0 - """Rank of the data parallel group.""" - data_parallel_rank_local: Optional[int] = None - """Local rank of the data parallel group, - set only in SPMD mode.""" - data_parallel_master_ip: str = "127.0.0.1" - """IP of the data parallel master.""" - data_parallel_rpc_port: int = 29550 - """Port for data parallel messaging.""" - data_parallel_master_port: int = 29500 - """Port of the data parallel master.""" - data_parallel_backend: str = "mp" - """Backend to use for data parallel, either "mp" or "ray".""" - data_parallel_external_lb: bool = False - """Whether to use "external" DP LB mode. Applies only to online serving - and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" - wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank - is provided explicitly to vllm serve.""" - data_parallel_hybrid_lb: bool = False - """Whether to use "hybrid" DP LB mode. Applies only to online serving - and when data_parallel_size > 0. Enables running an AsyncLLM - and API server on a "per-node" basis where vLLM load balances - between local data parallel ranks, but an external LB balances - between vLLM nodes/replicas. Set explicitly in conjunction with - --data-parallel-start-rank.""" - enable_expert_parallel: bool = False - """Use expert parallelism instead of tensor parallelism for MoE layers.""" - enable_eplb: bool = False - """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ - - max_parallel_loading_workers: Optional[int] = None - """Maximum number of parallel loading workers when loading model - sequentially in multiple batches. To avoid RAM OOM when using tensor - parallel and large models.""" - - disable_custom_all_reduce: bool = False - """Disable the custom all-reduce kernel and fall back to NCCL.""" - - ray_workers_use_nsight: bool = False - """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" - - ray_runtime_env: Optional["RuntimeEnv"] = None - """Ray runtime environment to pass to distributed workers.""" - - placement_group: Optional["PlacementGroup"] = None - """ray distributed model workers placement group.""" - - distributed_executor_backend: Optional[Union[DistributedExecutorBackend, - type["ExecutorBase"]]] = None - """Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). 
If the product - of pipeline_parallel_size and tensor_parallel_size is less than - or equal to the number of GPUs available, "mp" will be used to - keep processing on a single host. Otherwise, this will default - to "ray" if Ray is installed and fail otherwise. Note that tpu - only support Ray for distributed inference.""" - - worker_cls: str = "auto" - """The full name of the worker class to use. If "auto", the worker class - will be determined based on the platform.""" - sd_worker_cls: str = "auto" - """The full name of the worker class to use for speculative decoding. - If "auto", the worker class will be determined based on the platform.""" - worker_extension_cls: str = "" - """The full name of the worker extension class to use. The worker extension - class is dynamically inherited by the worker class. This is used to inject - new attributes and methods to the worker class for use in collective_rpc - calls.""" - - world_size: int = field(init=False) - """world_size is TPxPP, it affects the number of workers we create.""" - - rank: int = 0 - """Global rank in distributed setup.""" - - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. - Only support LLama4 for now""" - - @property - def world_size_across_dp(self) -> int: - """world_size_across_dp is TPxPPxDP, it is the size of the world - including data parallelism.""" - return self.world_size * self.data_parallel_size - - def get_next_dp_init_port(self) -> int: - """ - We might need to initialize process groups in multiple - processes that is related to data parallelism, - e.g. both in the worker and in the engine, which - can live in different processes. To avoid port conflicts, we - increment the port number each time we need to initialize a - new process group related to data parallelism. - """ - answer = self.data_parallel_master_port - self.data_parallel_master_port += 1 - return answer - - def stateless_init_dp_group(self) -> "ProcessGroup": - # NOTE: In high-concurrency scenarios multiple processes - # can pick the same (currently free) port through a race - # condition when calling `get_open_port()`. When the first - # process binds the port the others will subsequently fail - # with `torch.distributed.DistNetworkError: EADDRINUSE`. - # To make the initialization more robust we retry a few times - # with a fresh port whenever this specific error is observed. - from torch.distributed import DistNetworkError - - from vllm.distributed.utils import ( - stateless_init_torch_distributed_process_group) - - max_retries = 5 - last_exc: Optional[Exception] = None - for _ in range(max_retries): - try: - # use gloo since the engine process might not have cuda device - return stateless_init_torch_distributed_process_group( - self.data_parallel_master_ip, - self.get_next_dp_init_port(), - self.data_parallel_rank, - self.data_parallel_size, - backend="gloo") - except DistNetworkError as e: - # We only want to retry when the root cause is EADDRINUSE. - if "EADDRINUSE" in str(e): - logger.warning( - "Address already in use. Retrying with a new port.") - last_exc = e - continue # try again with a new port - raise e - - # If we get here all retries have failed. 
- assert last_exc is not None - raise last_exc - - @staticmethod - def has_unfinished_dp(dp_group: "ProcessGroup", - has_unfinished: bool) -> bool: - tensor = torch.tensor([has_unfinished], - dtype=torch.int32, - device="cpu") - # dp rank 0: has_unfinished_seqs=True - # dp rank 1: has_unfinished_seqs=False - # aggregated: has_unfinished_seqs=True - # so this is an OR operation, i.e. MAX in integers - torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group) - aggregated_has_unfinished = bool(tensor.item()) - return aggregated_has_unfinished - - @staticmethod - def sync_kv_cache_memory_size(dp_group: "ProcessGroup", - kv_cache_memory: int) -> int: - if kv_cache_memory == -1: - kv_cache_memory = torch.iinfo(torch.int64).max - tensor = torch.tensor([kv_cache_memory], - dtype=torch.int64, - device="cpu") - # we cannot use broadcast for stateless dp group since it depends - # on global rank - torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) - return tensor.item() - - def compute_hash(self): - """ - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.pipeline_parallel_size) - factors.append(self.tensor_parallel_size) - factors.append(self.enable_expert_parallel) - factors.append(self.data_parallel_size) - factors.append(envs.VLLM_ALL2ALL_BACKEND) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __post_init__(self) -> None: - self.world_size = self.pipeline_parallel_size * \ - self.tensor_parallel_size - - if self.data_parallel_size_local > self.data_parallel_size: - raise ValueError( - f"data_parallel_size_local ({self.data_parallel_size_local}) " - f"must be <= data_parallel_size ({self.data_parallel_size})") - - if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: - # Data parallel was specified in the engine args. - self.data_parallel_master_port = get_open_port() - - if not (0 <= self.data_parallel_rank < self.data_parallel_size): - raise ValueError( - f"data_parallel_rank ({self.data_parallel_rank})" - f" must be in the range [0, {self.data_parallel_size})") - else: - # Otherwise fall back to env vars (e.g. for offline SPMD case). 
- self.data_parallel_size = envs.VLLM_DP_SIZE - self.data_parallel_rank = envs.VLLM_DP_RANK - self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL - self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP - self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT - - if self.data_parallel_external_lb: - raise ValueError("data_parallel_external_lb can only " - "be set when data_parallel_size > 1") - - if self.distributed_executor_backend == "external_launcher": - import os - os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" - logger.info("Disabling V1 multiprocessing for external launcher.") - - if self.enable_eplb: - if not current_platform.is_cuda(): - raise ValueError( - "Expert parallelism load balancing is only supported on " - "CUDA devices now.") - if self.num_redundant_experts < 0: - raise ValueError( - "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") - if not self.enable_expert_parallel: - raise ValueError( - "enable_expert_parallel must be True to use EPLB.") - if self.tensor_parallel_size * self.data_parallel_size <= 1: - raise ValueError( - "EPLB requires tensor_parallel_size or data_parallel_size " - f"to be greater than 1, but got " - f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." - ) - else: - if self.num_redundant_experts != 0: - raise ValueError( - "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") - if self.distributed_executor_backend is None and self.world_size > 1: - # We use multiprocessing by default if world_size fits on the - # current node and we aren't in a ray placement group. - - from vllm.executor import ray_utils - backend: DistributedExecutorBackend = "mp" - ray_found = ray_utils.ray_is_available() - if current_platform.is_neuron(): - # neuron uses single process to control multiple devices - backend = "uni" - elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: - backend = "uni" - elif (current_platform.is_cuda() - and cuda_device_count_stateless() < self.world_size): - if not ray_found: - raise ValueError("Unable to load Ray: " - f"{ray_utils.ray_import_err}. 
Ray is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`.") - backend = "ray" - elif self.data_parallel_backend == "ray": - logger.info("Using ray distributed inference because " - "data_parallel_backend is ray") - backend = "ray" - elif ray_found: - if self.placement_group: - backend = "ray" - else: - from ray import is_initialized as ray_is_initialized - if ray_is_initialized(): - from ray.util import get_current_placement_group - if get_current_placement_group(): - backend = "ray" - self.distributed_executor_backend = backend - logger.debug("Defaulting to use %s for distributed inference", - backend) - - if self.distributed_executor_backend is None and self.world_size == 1: - self.distributed_executor_backend = "uni" - - @property - def use_ray(self) -> bool: - return self.distributed_executor_backend == "ray" or ( - isinstance(self.distributed_executor_backend, type) - and self.distributed_executor_backend.uses_ray) - - @model_validator(mode='after') - def _verify_args(self) -> Self: - # Lazy import to avoid circular import - from vllm.executor.executor_base import ExecutorBase - from vllm.platforms import current_platform - if self.distributed_executor_backend not in ( - "ray", "mp", "uni", - "external_launcher", None) and not (isinstance( - self.distributed_executor_backend, type) and issubclass( - self.distributed_executor_backend, ExecutorBase)): - raise ValueError( - "Unrecognized distributed executor backend " - f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' 'uni', 'external_launcher' or" - " custom ExecutorBase subclass.") - if self.use_ray: - from vllm.executor import ray_utils - ray_utils.assert_ray_available() - - if not current_platform.use_custom_allreduce(): - self.disable_custom_all_reduce = True - logger.debug( - "Disabled the custom all-reduce kernel because it is not " - "supported on current platform.") - if self.ray_workers_use_nsight and not self.use_ray: - raise ValueError("Unable to use nsight profiling unless workers " - "run with Ray.") - - return self - - PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index c1b3a61217..8a78d811b9 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -16,7 +16,7 @@ from vllm.logger import init_logger from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname if TYPE_CHECKING: - from vllm.config.config import VllmConfig + from vllm.config import VllmConfig else: VllmConfig = object diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py new file mode 100644 index 0000000000..bac1e63800 --- /dev/null +++ b/vllm/config/parallel.py @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, Union + +import torch +from pydantic import model_validator +from pydantic.dataclasses import dataclass +from torch.distributed import ProcessGroup, ReduceOp +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import cuda_device_count_stateless, get_open_port + +if TYPE_CHECKING: + from ray.runtime_env import RuntimeEnv + from ray.util.placement_group import PlacementGroup + + from 
vllm.executor.executor_base import ExecutorBase +else: + RuntimeEnv = Any + PlacementGroup = Any + ExecutorBase = Any + +logger = init_logger(__name__) + +DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] + + +@config +@dataclass +class ParallelConfig: + """Configuration for the distributed execution.""" + + pipeline_parallel_size: int = 1 + """Number of pipeline parallel groups.""" + tensor_parallel_size: int = 1 + """Number of tensor parallel groups.""" + data_parallel_size: int = 1 + """Number of data parallel groups. MoE layers will be sharded according to + the product of the tensor parallel size and data parallel size.""" + data_parallel_size_local: int = 1 + """Number of local data parallel groups.""" + data_parallel_rank: int = 0 + """Rank of the data parallel group.""" + data_parallel_rank_local: Optional[int] = None + """Local rank of the data parallel group, + set only in SPMD mode.""" + data_parallel_master_ip: str = "127.0.0.1" + """IP of the data parallel master.""" + data_parallel_rpc_port: int = 29550 + """Port for data parallel messaging.""" + data_parallel_master_port: int = 29500 + """Port of the data parallel master.""" + data_parallel_backend: str = "mp" + """Backend to use for data parallel, either "mp" or "ray".""" + data_parallel_external_lb: bool = False + """Whether to use "external" DP LB mode. Applies only to online serving + and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" + wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank + is provided explicitly to vllm serve.""" + data_parallel_hybrid_lb: bool = False + """Whether to use "hybrid" DP LB mode. Applies only to online serving + and when data_parallel_size > 0. Enables running an AsyncLLM + and API server on a "per-node" basis where vLLM load balances + between local data parallel ranks, but an external LB balances + between vLLM nodes/replicas. Set explicitly in conjunction with + --data-parallel-start-rank.""" + enable_expert_parallel: bool = False + """Use expert parallelism instead of tensor parallelism for MoE layers.""" + enable_eplb: bool = False + """Enable expert parallelism load balancing for MoE layers.""" + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + eplb_window_size: int = 1000 + """Window size for expert load recording.""" + eplb_step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `eplb_window_size` steps will be used for rearranging experts. + """ + eplb_log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + max_parallel_loading_workers: Optional[int] = None + """Maximum number of parallel loading workers when loading model + sequentially in multiple batches. 
To avoid RAM OOM when using tensor + parallel and large models.""" + + disable_custom_all_reduce: bool = False + """Disable the custom all-reduce kernel and fall back to NCCL.""" + + ray_workers_use_nsight: bool = False + """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" + + ray_runtime_env: Optional[RuntimeEnv] = None + """Ray runtime environment to pass to distributed workers.""" + + placement_group: Optional[PlacementGroup] = None + """ray distributed model workers placement group.""" + + distributed_executor_backend: Optional[Union[DistributedExecutorBackend, + type[ExecutorBase]]] = None + """Backend to use for distributed model + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + only support Ray for distributed inference.""" + + worker_cls: str = "auto" + """The full name of the worker class to use. If "auto", the worker class + will be determined based on the platform.""" + sd_worker_cls: str = "auto" + """The full name of the worker class to use for speculative decoding. + If "auto", the worker class will be determined based on the platform.""" + worker_extension_cls: str = "" + """The full name of the worker extension class to use. The worker extension + class is dynamically inherited by the worker class. This is used to inject + new attributes and methods to the worker class for use in collective_rpc + calls.""" + + world_size: int = field(init=False) + """world_size is TPxPP, it affects the number of workers we create.""" + + rank: int = 0 + """Global rank in distributed setup.""" + + enable_multimodal_encoder_data_parallel: bool = False + """ Use data parallelism instead of tensor parallelism for vision encoder. + Only support LLama4 for now""" + + @property + def world_size_across_dp(self) -> int: + """world_size_across_dp is TPxPPxDP, it is the size of the world + including data parallelism.""" + return self.world_size * self.data_parallel_size + + def get_next_dp_init_port(self) -> int: + """ + We might need to initialize process groups in multiple + processes that is related to data parallelism, + e.g. both in the worker and in the engine, which + can live in different processes. To avoid port conflicts, we + increment the port number each time we need to initialize a + new process group related to data parallelism. + """ + answer = self.data_parallel_master_port + self.data_parallel_master_port += 1 + return answer + + def stateless_init_dp_group(self) -> ProcessGroup: + # NOTE: In high-concurrency scenarios multiple processes + # can pick the same (currently free) port through a race + # condition when calling `get_open_port()`. When the first + # process binds the port the others will subsequently fail + # with `torch.distributed.DistNetworkError: EADDRINUSE`. + # To make the initialization more robust we retry a few times + # with a fresh port whenever this specific error is observed. 
+ from torch.distributed import DistNetworkError + + from vllm.distributed.utils import ( + stateless_init_torch_distributed_process_group) + + max_retries = 5 + last_exc: Optional[Exception] = None + for _ in range(max_retries): + try: + # use gloo since the engine process might not have cuda device + return stateless_init_torch_distributed_process_group( + self.data_parallel_master_ip, + self.get_next_dp_init_port(), + self.data_parallel_rank, + self.data_parallel_size, + backend="gloo") + except DistNetworkError as e: + # We only want to retry when the root cause is EADDRINUSE. + if "EADDRINUSE" in str(e): + logger.warning( + "Address already in use. Retrying with a new port.") + last_exc = e + continue # try again with a new port + raise e + + # If we get here all retries have failed. + assert last_exc is not None + raise last_exc + + @staticmethod + def has_unfinished_dp(dp_group: ProcessGroup, + has_unfinished: bool) -> bool: + tensor = torch.tensor([has_unfinished], + dtype=torch.int32, + device="cpu") + # dp rank 0: has_unfinished_seqs=True + # dp rank 1: has_unfinished_seqs=False + # aggregated: has_unfinished_seqs=True + # so this is an OR operation, i.e. MAX in integers + torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group) + aggregated_has_unfinished = bool(tensor.item()) + return aggregated_has_unfinished + + @staticmethod + def sync_kv_cache_memory_size(dp_group: ProcessGroup, + kv_cache_memory: int) -> int: + if kv_cache_memory == -1: + kv_cache_memory = torch.iinfo(torch.int64).max + tensor = torch.tensor([kv_cache_memory], + dtype=torch.int64, + device="cpu") + # we cannot use broadcast for stateless dp group since it depends + # on global rank + torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) + return tensor.item() + + def compute_hash(self): + """ + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) + factors.append(self.enable_expert_parallel) + factors.append(self.data_parallel_size) + factors.append(envs.VLLM_ALL2ALL_BACKEND) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + + if self.data_parallel_size_local > self.data_parallel_size: + raise ValueError( + f"data_parallel_size_local ({self.data_parallel_size_local}) " + f"must be <= data_parallel_size ({self.data_parallel_size})") + + if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: + # Data parallel was specified in the engine args. + self.data_parallel_master_port = get_open_port() + + if not (0 <= self.data_parallel_rank < self.data_parallel_size): + raise ValueError( + f"data_parallel_rank ({self.data_parallel_rank})" + f" must be in the range [0, {self.data_parallel_size})") + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). 
+ self.data_parallel_size = envs.VLLM_DP_SIZE + self.data_parallel_rank = envs.VLLM_DP_RANK + self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + + if self.data_parallel_external_lb: + raise ValueError("data_parallel_external_lb can only " + "be set when data_parallel_size > 1") + + if self.distributed_executor_backend == "external_launcher": + import os + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + logger.info("Disabling V1 multiprocessing for external launcher.") + + if self.enable_eplb: + if not current_platform.is_cuda(): + raise ValueError( + "Expert parallelism load balancing is only supported on " + "CUDA devices now.") + if self.num_redundant_experts < 0: + raise ValueError( + "num_redundant_experts must be non-negative, but got " + f"{self.num_redundant_experts}.") + if not self.enable_expert_parallel: + raise ValueError( + "enable_expert_parallel must be True to use EPLB.") + if self.tensor_parallel_size * self.data_parallel_size <= 1: + raise ValueError( + "EPLB requires tensor_parallel_size or data_parallel_size " + f"to be greater than 1, but got " + f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." + ) + else: + if self.num_redundant_experts != 0: + raise ValueError( + "num_redundant_experts should be used with EPLB." + f"{self.num_redundant_experts}.") + if self.distributed_executor_backend is None and self.world_size > 1: + # We use multiprocessing by default if world_size fits on the + # current node and we aren't in a ray placement group. + + from vllm.executor import ray_utils + backend: DistributedExecutorBackend = "mp" + ray_found = ray_utils.ray_is_available() + if current_platform.is_neuron(): + # neuron uses single process to control multiple devices + backend = "uni" + elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + backend = "uni" + elif (current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size): + if not ray_found: + raise ValueError("Unable to load Ray: " + f"{ray_utils.ray_import_err}. 
Ray is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`.") + backend = "ray" + elif self.data_parallel_backend == "ray": + logger.info("Using ray distributed inference because " + "data_parallel_backend is ray") + backend = "ray" + elif ray_found: + if self.placement_group: + backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" + self.distributed_executor_backend = backend + logger.debug("Defaulting to use %s for distributed inference", + backend) + + if self.distributed_executor_backend is None and self.world_size == 1: + self.distributed_executor_backend = "uni" + + @property + def use_ray(self) -> bool: + return self.distributed_executor_backend == "ray" or ( + isinstance(self.distributed_executor_backend, type) + and self.distributed_executor_backend.uses_ray) + + @model_validator(mode='after') + def _verify_args(self) -> Self: + # Lazy import to avoid circular import + from vllm.executor.executor_base import ExecutorBase + from vllm.platforms import current_platform + if self.distributed_executor_backend not in ( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( + self.distributed_executor_backend, type) and issubclass( + self.distributed_executor_backend, ExecutorBase)): + raise ValueError( + "Unrecognized distributed executor backend " + f"{self.distributed_executor_backend}. Supported " + "values are 'ray', 'mp' 'uni', 'external_launcher' or" + " custom ExecutorBase subclass.") + if self.use_ray: + from vllm.executor import ray_utils + ray_utils.assert_ray_available() + + if not current_platform.use_custom_allreduce(): + self.disable_custom_all_reduce = True + logger.debug( + "Disabled the custom all-reduce kernel because it is not " + "supported on current platform.") + if self.ray_workers_use_nsight and not self.use_ray: + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") + + return self From 5a16fa614c78e1f401125cd7384c602f83cb2160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 9 Aug 2025 18:56:25 +0200 Subject: [PATCH 125/932] [Model] Gemma3n MM (#20495) Signed-off-by: ShriKode Signed-off-by: NickLucche Signed-off-by: Roger Wang Co-authored-by: ShriKode Co-authored-by: Roger Wang --- docs/models/supported_models.md | 15 +- examples/offline_inference/audio_language.py | 20 + examples/offline_inference/vision_language.py | 27 + requirements/test.in | 2 +- requirements/test.txt | 2 +- .../multimodal/processing/test_common.py | 5 +- tests/models/registry.py | 4 +- tests/test_test.py | 61 ++ vllm/model_executor/models/gemma3n.py | 79 +- vllm/model_executor/models/gemma3n_mm.py | 700 ++++++++++++++++++ vllm/model_executor/models/registry.py | 4 +- 11 files changed, 864 insertions(+), 55 deletions(-) create mode 100644 tests/test_test.py create mode 100644 vllm/model_executor/models/gemma3n_mm.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19186a0635..5c48998ba4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -349,7 +349,7 @@ th { | `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | +| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -412,9 +412,6 @@ th { !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -!!! note - Only text inputs are currently supported for `Gemma3nForConditionalGeneration`. To use this model, please upgrade Hugging Face Transformers to version 4.53.0. - ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. @@ -608,6 +605,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | +| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -677,6 +675,15 @@ Some models are supported only via the [Transformers backend](#transformers). Th This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. +!!! note + `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its + MobileNet-v5 vision backbone. + + Performance is not yet fully optimized mainly due to: + + - Both audio and vision MM encoders use `transformers.AutoModel` implementation. + - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. + !!! note Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 01d6a188be..22cb8b057d 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -96,6 +96,25 @@ def run_voxtral(question: str, audio_count: int) -> ModelRequestData: ) +# Gemma3N +def run_gemma3n(question: str, audio_count: int) -> ModelRequestData: + model_name = "google/gemma-3n-E2B-it" + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_batched_tokens=2048, + max_num_seqs=2, + limit_mm_per_prompt={"audio": audio_count}, + enforce_eager=True, + ) + prompt = f"user\n{question}" + "\nmodel\n" + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # Granite Speech def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: # NOTE - the setting in this example are somehat different than what is @@ -331,6 +350,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: model_example_map = { "voxtral": run_voxtral, + "gemma3n": run_gemma3n, "granite_speech": run_granite_speech, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 1314d33e90..5b3f0d2dc2 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -211,7 +211,33 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: ) for question in questions ] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + +# Gemma3N +def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "google/gemma-3n-E2B-it" + + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + prompts = [ + ( + "user\n" + f"{question}\n" + "model\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, prompts=prompts, @@ -1395,6 +1421,7 @@ model_example_map = { "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, + "gemma3n": run_gemma3n, "glm4v": run_glm4v, "glm4_1v": run_glm4_1v, "h2ovl_chat": run_h2ovl, diff --git a/requirements/test.in b/requirements/test.in index ca22fd1551..6652bfdfe6 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,7 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests -timm # required for internvl test +timm >=1.0.17 # required for internvl and gemma3n-mm test torch==2.7.1 torchaudio==2.7.1 torchvision==0.22.1 diff --git a/requirements/test.txt b/requirements/test.txt index 377eeb58c4..ff9886a315 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1051,7 +1051,7 @@ tiktoken==0.7.0 # via # lm-eval # mistral-common -timm==1.0.15 +timm==1.0.17 # via # -r requirements/test.in # open-clip-torch diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index bd1c55d95d..906966ddd0 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -271,6 +271,7 @@ def _test_processing_correctness_one( "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", + "google/gemma-3n-E2B-it", 
"zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", "ibm-granite/granite-speech-3.3-2b", @@ -315,7 +316,7 @@ def _test_processing_correctness_one( "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", "omni-research/Tarsier-7b", - "omni-research/Tarsier2-Recap-7b" + "omni-research/Tarsier2-Recap-7b", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -327,6 +328,8 @@ def test_processing_correctness( num_batches: int, simplify_rate: float, ): + if model_id == "google/gemma-3n-E2B-it": + pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.") _test_processing_correctness( model_id, hit_rate=hit_rate, diff --git a/tests/models/registry.py b/tests/models/registry.py index 09d62413fe..e0939d1a20 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -186,7 +186,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), - "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 + "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it", min_transformers_version="4.53"), "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"), "Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"), @@ -391,6 +391,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), + "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 + min_transformers_version="4.53"), "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501 "GLM4VForCausalLM": _HfExamplesInfo("zai-org/glm-4v-9b", trust_remote_code=True, diff --git a/tests/test_test.py b/tests/test_test.py new file mode 100644 index 0000000000..dc8c9814ed --- /dev/null +++ b/tests/test_test.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm import LLM, envs +from vllm.sampling_params import SamplingParams + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"]) +# TODO TPU will appear busy if we fan-out test params here +@pytest.mark.parametrize("n_prompts", [1]) +def test_logprobs(model_name: str, n_prompts: int): + """ + Request top logprobs with different sampling settings and check + that results contains the requested number, ordered ascendingly. + """ + + def check_num_logprobs(logprobs, expected_num: int): + for step in logprobs: + prev_logp = 1.0 + # order by rank + sorted_step = dict( + sorted(step.items(), key=lambda item: item[1].rank)) + + if len(step) != expected_num: + print("watch out", sorted_step) + + # check results are ordered by prob value + # assert len(step) == expected_num + for rankno, (tid, logp) in enumerate(sorted_step.items()): + assert logp.logprob <= prev_logp + prev_logp = logp.logprob + assert logp.rank == rankno + 1 + + llm = LLM(model_name, + enforce_eager=False, + max_num_seqs=1, + max_model_len=128, + max_num_batched_tokens=128) + prompts = [ + "Write a short story about a robot that dreams for the first time." 
+ ] * n_prompts + greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64,\ + logprobs=4) + regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\ + logprobs=4) + topkp_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\ + logprobs=4, top_k=12, top_p=0.5) + + for sp in [greedy_sampling_params, regular_sampling_params, \ + topkp_sampling_params]: + output = llm.generate(prompts, sp) + for o in output: + check_num_logprobs(o.outputs[0].logprobs, 4) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index e16c03c8d3..4b41cba1c7 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -331,14 +331,15 @@ class Gemma3nAttention(nn.Module): config.num_kv_shared_layers) self.is_kv_shared = layer_idx >= first_kv_shared_layer_idx + kv_sharing_target_layer_name = None if self.is_kv_shared: # Last full attention layer is 1 before sharing # Last sliding attention layer is 2 before sharing offset = 2 if self.sliding_window is not None else 1 kv_shared_layer_index = first_kv_shared_layer_idx - offset - kv_sharing_target_layer_name = f"model.language_model.layers.{kv_shared_layer_index}.self_attn.attn" # noqa: E501 - else: - kv_sharing_target_layer_name = None + if kv_shared_layer_index >= 0: + # Only the greater layer is required to specify sharing. + kv_sharing_target_layer_name = f"language_model.model.layers.{kv_shared_layer_index}.self_attn.attn" # noqa: E501 self.rotary_emb = get_rope( self.head_dim, @@ -396,6 +397,7 @@ class Gemma3nDecoderLayer(nn.Module): prefix: str = "", ) -> None: super().__init__() + assert isinstance(config, Gemma3nTextConfig) self.altup_active_idx = config.altup_active_idx assert config.altup_correct_scale @@ -537,7 +539,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config.text_config + config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config @@ -553,6 +555,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): config.hidden_size**0.5, dtype=self.embed_tokens.weight.dtype, ) + # Additional per-layer embeddings (PLE) self.embed_tokens_per_layer = VocabParallelEmbedding( config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input, @@ -636,6 +639,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, + per_layer_inputs: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: @@ -644,13 +649,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): else: hidden_states_0 = self.get_input_embeddings(input_ids) - # Per layer inputs. 
- if input_ids is None: - raise ValueError("Passing None for input ids is not supported.") - per_layer_inputs = self.get_per_layer_input_embeddings(input_ids) - per_layer_inputs = per_layer_inputs.reshape( - -1, self.config.num_hidden_layers, - self.config.hidden_size_per_layer_input) per_layer_projection = self.per_layer_model_projection(hidden_states_0) per_layer_projection = per_layer_projection.reshape( *hidden_states_0.shape[:-1], @@ -659,8 +657,13 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): ) per_layer_projection = self.per_layer_projection_norm( per_layer_projection) - per_layer_inputs = per_layer_projection + per_layer_inputs - per_layer_inputs *= self.per_layer_input_scale + + if per_layer_inputs is not None: + # Profiling run does not compute per_layer_inputs + per_layer_inputs = per_layer_projection + per_layer_inputs + per_layer_inputs *= self.per_layer_input_scale + else: + per_layer_inputs = per_layer_projection # Altup embed. hidden_states = [hidden_states_0] * self.config.altup_num_inputs @@ -760,29 +763,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): return loaded_params -class Gemma3nModel(nn.Module): - - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.language_model = Gemma3nTextModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "language_model")) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - return self.language_model(input_ids=input_ids, - positions=positions, - inputs_embeds=inputs_embeds, - **kwargs) - - -class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): +class Gemma3nForCausalLM(nn.Module): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -802,25 +783,33 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): super().__init__() self.config = config self.cache_config = vllm_config.cache_config - self.model = Gemma3nModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.model = Gemma3nTextModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( - config.text_config.vocab_size, - soft_cap=config.text_config.final_logit_softcapping) + config.vocab_size, soft_cap=config.final_logit_softcapping) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.language_model.get_input_embeddings(input_ids) + return self.model.get_input_embeddings(input_ids) def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, + *, + per_layer_inputs: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds, **kwargs) + + hidden_states = self.model( + input_ids, + positions, + per_layer_inputs=per_layer_inputs, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) return hidden_states def compute_logits( @@ -828,8 +817,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): hidden_states: torch.Tensor, sampling_metadata: Optional[SamplingMetadata], ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.language_model.embed_tokens, - hidden_states, sampling_metadata) + logits = 
self.logits_processor(self.model.embed_tokens, hidden_states, + sampling_metadata) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py new file mode 100644 index 0000000000..a0c3bb5007 --- /dev/null +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -0,0 +1,700 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Optional, TypedDict, Union, cast + +import torch +from torch import nn +from transformers import AutoModel, BatchFeature +from transformers.models.gemma3n import (Gemma3nAudioConfig, + Gemma3nAudioFeatureExtractor, + Gemma3nConfig, Gemma3nProcessor, + Gemma3nTextConfig, + Gemma3nVisionConfig) +from transformers.models.siglip import SiglipImageProcessorFast + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs) +from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, + MultiModalDataParser) +# yapf: disable +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BoundPromptUpdate, + PlaceholderFeaturesInfo, + PromptReplacement, PromptTargetMatch, + PromptUpdate, PromptUpdateDetails, + find_mm_placeholders, + replace_token_matches) +# yapf: enable +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + +logger = init_logger(__name__) + +# This should be based on model config but we hardcode them for now. 
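+# NOTE: for reference, these appear to match config.vision_soft_tokens_per_image
+# (256 image soft tokens) and the 188 audio soft tokens the Gemma3nProcessor
+# inserts for every (30s) audio clip; see the padding comment in
+# _process_audio_input below.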
+TOKENS_PER_IMAGE = 256 +TOKENS_PER_AUDIO = 188 + + +class Gemma3nImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + """Shape: `(batch_size * num_images, num_channels, height, width)`""" + + +class Gemma3nAudioInputs(TypedDict): + input_features: torch.Tensor + """Shape: `(batch_size * num_audio, seq_length, num_features)`""" + input_features_mask: torch.Tensor + """Shape: `(batch_size * num_audio, seq_length)`""" + + +Gemma3nImageInputs = Gemma3nImagePixelInputs + + +class Gemma3nProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Gemma3nConfig) + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(Gemma3nProcessor, **kwargs) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "audio": None} + + def get_max_tokens_per_item( + self, seq_len: int, + mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]: + + return {"image": TOKENS_PER_IMAGE, "audio": TOKENS_PER_AUDIO} + + def get_image_repl( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Gemma3nProcessor], + ) -> str: + """ + Get the replacement text for image tokens. + + For Gemma3n, this should return the full_image_sequence which includes + BOI token, repeated image tokens, and EOI token. + """ + if processor is None: + processor = self.get_hf_processor() + + return PromptUpdateDetails.select_token_id( + processor.full_image_sequence, processor.image_token_id) + + def get_audio_repl( + self, + *, + processor: Optional[Gemma3nProcessor], + ) -> str: + """ + Get the replacement text for audio tokens. + + For Gemma3n, this should return the full_audio_sequence which includes + BOA token, repeated audio tokens, and EOA token. + """ + if processor is None: + processor = self.get_hf_processor() + + # Return the full audio sequence as defined by the processor + return PromptUpdateDetails.select_token_id( + processor.full_audio_sequence, processor.audio_token_id) + + +class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_audios = mm_counts.get("audio", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + audio_token = processor.audio_token + + return image_token * num_images + audio_token * num_audios + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_audios = mm_counts.get("audio", 0) + processor = self.info.get_hf_processor() + audio_feature_extractor: Gemma3nAudioFeatureExtractor = processor.feature_extractor # noqa: E501 + audio_len = audio_feature_extractor.fft_length + image_processor: SiglipImageProcessorFast = processor.image_processor + img_width = image_processor.size.get("width", 224) + img_height = image_processor.size.get("height", 224) + + return { + "image": + self._get_dummy_images(width=img_width, + height=img_height, + num_images=num_images), + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + +class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] + ): + + def _get_data_parser(self) -> MultiModalDataParser: + feature_extractor = self.info.get_hf_processor().feature_extractor + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + 
mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + + # HF Transformers audio processor no longer accepts `audios` key. + # We pop `audios` and replace it with `audio` key to surpress + # the warning. + if 'audios' in mm_data: + mm_data['audio'] = mm_data.pop('audios') + processed_outputs = super()._call_hf_processor( + prompt, + mm_data, + mm_kwargs, + tok_kwargs, + ) + if 'input_features' in processed_outputs: + # Avoid padding since we need the output of each item to be + # independent of other items for the cache to work correctly + unpadded_features = [ + f[mask] for f, mask in zip( + processed_outputs["input_features"], + processed_outputs["input_features_mask"], + ) + ] + processed_outputs["input_features"] = unpadded_features + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + return dict(pixel_values=MultiModalFieldConfig.batched("image"), + input_features=MultiModalFieldConfig.batched("audio"), + input_features_mask=MultiModalFieldConfig.batched("audio")) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + prompt_updates = [] + + # Handle image tokens + if "image" in mm_items: + image_token = hf_processor.image_token + + def get_replacement_image(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + return self.info.get_image_repl( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + prompt_updates.append( + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_image, + )) + + # Handle audio tokens + if "audio" in mm_items: + audio_token = hf_processor.audio_token + + def get_replacement_audio(item_idx: int): + return self.info.get_audio_repl(processor=hf_processor, ) + + prompt_updates.append( + PromptReplacement( + modality="audio", + target=audio_token, + replacement=get_replacement_audio, + )) + + return prompt_updates + + def _apply_token_matches( + self, + prompt: list[int], + mm_matches: Mapping[str, Sequence[PromptTargetMatch]], + mm_item_counts: Mapping[str, int], + ) -> list[int]: + token_ids = super()._apply_token_matches( + prompt, + mm_matches, + mm_item_counts, + ) + + # "\n\n\n" and "\n\n\n\n" are single tokens + # Since our replacement can insert "\n\n" next to "\n" + # tokens, we have to combine them to be consistent with + # the output of the tokenizer + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + newline_1 = vocab["\n"] + newline_2 = vocab["\n\n"] + newline_3 = vocab["\n\n\n"] + newline_4 = vocab["\n\n\n\n"] + + token_ids = replace_token_matches( + token_ids, + [newline_1, newline_2], + [newline_3], + ) + token_ids = replace_token_matches( + token_ids, + [newline_2, newline_1], + [newline_3], + ) + token_ids = replace_token_matches( + token_ids, + [newline_2, newline_2], + [newline_4], + ) + + return token_ids + + def _find_mm_placeholders( + self, + mm_prompt_updates: Mapping[str, Sequence[BoundPromptUpdate]], + new_token_ids: list[int], + mm_item_counts: Mapping[str, int], + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + # We need to detect "\n\n" inside "\n\n\n" and "\n\n\n\n" + tokenizer = 
self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + newline_1 = vocab["\n"] + newline_2 = vocab["\n\n"] + newline_3 = vocab["\n\n\n"] + newline_4 = vocab["\n\n\n\n"] + + def get_repl_toks(tok: int) -> list[int]: + if tok == newline_3: + return [newline_1, newline_2] + if tok == newline_4: + return [newline_2, newline_2] + + return [tok] + + repl_token_ids = list[int]() + repl_orig_idxs = list[int]() + for orig_idx, orig_tok in enumerate(new_token_ids): + repl_toks = get_repl_toks(orig_tok) + repl_token_ids.extend(repl_toks) + repl_orig_idxs.extend(orig_idx for _ in range(len(repl_toks))) + + repls = find_mm_placeholders(mm_prompt_updates, repl_token_ids, + mm_item_counts) + + return { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=repl_orig_idxs[p.start_idx], + tokens=p.tokens, + is_embed=p.is_embed, + ) for p in placeholders + ] + for modality, placeholders in repls.items() + } + + +class Gemma3nMultimodalEmbedder(nn.Module): + """Embeds token ids or soft tokens for multimodal content into language + model space.""" + + def __init__( + self, + multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig], + text_config: Gemma3nTextConfig, + ): + super().__init__() + + self.multimodal_hidden_size = multimodal_config.hidden_size + self.eps = multimodal_config.rms_norm_eps + self.vocab_offset = multimodal_config.vocab_offset + self.vocab_size = multimodal_config.vocab_size + self.text_hidden_size = text_config.hidden_size + + self.embedding = VocabParallelEmbedding( + self.vocab_size, + self.multimodal_hidden_size, + ) + + self.hard_embedding_norm = RMSNorm( + self.multimodal_hidden_size, + eps=self.eps, + ) + + self.soft_embedding_norm = RMSNorm( + self.multimodal_hidden_size, + eps=self.eps, + ) + + self.embedding_projection = RowParallelLinear( + self.multimodal_hidden_size, + self.text_hidden_size, + bias=False, + ) + + self.embedding_post_projection_norm = RMSNorm( + self.text_hidden_size, + eps=self.eps, + has_weight=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Embeds token ids or soft tokens for multimodal content into language model space. + + Args: + input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range + `[vocab_offset, vocab_offset + vocab_size)`. + inputs_embeds: A torch.Tensor containing the soft tokens to embed. + + Returns: + A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`. 
+ """ # noqa: E501 + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is not None: + emb_norm = self.soft_embedding_norm(inputs_embeds) + else: + hard_emb = self.embedding(input_ids - self.vocab_offset) + emb_norm = self.hard_embedding_norm(hard_emb) + + emb_norm_proj, _ = self.embedding_projection(emb_norm) + return self.embedding_post_projection_norm(emb_norm_proj) + + +@MULTIMODAL_REGISTRY.register_processor(Gemma3nMultiModalProcessor, + info=Gemma3nProcessingInfo, + dummy_inputs=Gemma3nDummyInputsBuilder) +class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.embed_audio.": "embed_audio.", + "model.embed_vision.": "embed_vision.", + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.audio_tower.": "audio_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "lm_head.": "language_model.lm_head.", + "model": "language_model.model", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config + self.quant_config = quant_config + self.multimodal_config = multimodal_config + self.vocab_size = config.text_config.vocab_size + + self.sliding_window = getattr(config.text_config, + "interleaved_sliding_window", None) + + self.vision_tower = AutoModel.from_config(config=config.vision_config) + self.audio_tower = AutoModel.from_config(config=config.audio_config) + self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, + config.text_config) + self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, + config.text_config) + + self.language_model: nn.Module = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Gemma3nForCausalLM"], + ) + self.language_model = cast(Gemma3nForCausalLM, self.language_model) + # NOTE (NickLucche) In order to be compatible with cudagraph, the + # buffer needs to be consistent, so we pre-allocate here. + self.per_layer_embeddings = torch.zeros( + vllm_config.scheduler_config.max_num_batched_tokens, + self.config.text_config.num_hidden_layers, + self.config.text_config.hidden_size_per_layer_input, + device=self.language_model.model.embed_tokens.weight.device, + dtype=self.language_model.model.embed_tokens.weight.dtype) + + @property + def dtype(self): + return next(self.parameters()).dtype + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + # TODO check if there are any + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Gemma3nImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + # TODO is this the case? + assert image_embeds is None, "Gemma3n does not support image_embeds." + if pixel_values is None: + return None + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + pixel_values = flatten_bn(pixel_values, concat=True) + pixel_values = pixel_values.contiguous() + + return Gemma3nImagePixelInputs( + pixel_values=self._validate_pixel_values(pixel_values), ) + + def _parse_and_validate_audio_input( + self, **kwargs: object) -> Optional[Gemma3nAudioInputs]: + input_features = kwargs.pop("input_features", None) + if input_features is None: + return None + + input_features_mask = kwargs.pop("input_features_mask", None) + if input_features_mask is None: + return None + + return Gemma3nAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + ) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + mm_input_by_modality = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", "image_embeds" + ) and "image" not in mm_input_by_modality: + mm_input_by_modality[ + "image"] = self._parse_and_validate_image_input(**kwargs) + if input_key == "input_features" \ + and "audio" not in mm_input_by_modality: + mm_input_by_modality[ + "audio"] = self._parse_and_validate_audio_input(**kwargs) + return mm_input_by_modality + + def _process_image_input( + self, + image_input: Gemma3nImageInputs, + ) -> list[torch.Tensor]: + assert self.vision_tower is not None + + pixel_values = image_input["pixel_values"] + vision_outputs = self.vision_tower(pixel_values=pixel_values, + do_pooling=False, + return_dict=True).last_hidden_state + # TODO try to avoid copy here + # (batch, channels, height, width) to (batch, height * width, channels) + vision_outputs = vision_outputs.reshape( + vision_outputs.shape[0], + self.config.vision_config.hidden_size, + self.config.vision_soft_tokens_per_image, + ).permute(0, 2, 1).contiguous() + # Normalize and embed the soft tokens into language model space. + vision_outputs *= self.config.vision_config.hidden_size**0.5 + # Return a list of embeddings instead of a batched tensor + return self.embed_vision(inputs_embeds=vision_outputs).unbind(0) + + def _process_audio_input( + self, + audio_input: Gemma3nAudioInputs, + ) -> list[torch.Tensor]: + assert self.audio_tower is not None + input_features = audio_input["input_features"].squeeze(1) + input_features_mask = audio_input["input_features_mask"].squeeze(1) + audio_outputs, audio_mask = self.audio_tower(input_features, + ~input_features_mask) + audio_features = self.embed_audio(inputs_embeds=audio_outputs) + + # ruff: noqa + # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the + # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens + # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad + # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. 
+ # TODO precompute and cache padding + audio_padding_toks = torch.tensor([[self.vocab_size - 1]], + dtype=torch.long, + device=audio_features.device) + audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks) + audio_features = torch.where(audio_mask.unsqueeze(-1), + audio_padding_embs, audio_features) + + audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape + extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len # noqa: E501 + extra_padding_features = audio_padding_embs.expand( + audio_batch_size, extra_padding_tokens, audio_embed_dim) + + audio_features = torch.cat((audio_features, extra_padding_features), + dim=1) + # Return a list of embeddings instead of a batched tensor + return audio_features.unbind(0) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: + mm_input_by_modality = self._parse_and_validate_multimodal_inputs( + **kwargs) + if mm_input_by_modality is None: + return [] + + multimodal_embeddings: list[torch.Tensor] = [] + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in mm_input_by_modality: + multimodal_input = mm_input_by_modality[modality] + if modality == "image": + vision_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings.extend(vision_embeddings) + if modality == "audio": + audio_embeddings = self._process_audio_input(multimodal_input) + multimodal_embeddings.extend(audio_embeddings) + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache + # them here, as the model forward has only access to the input_embeds. + if input_ids is not None: + per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings( + input_ids) + per_layer_inputs = per_layer_inputs.reshape( + -1, self.config.text_config.num_hidden_layers, + self.config.text_config.hidden_size_per_layer_input) + self.per_layer_embeddings[:per_layer_inputs.shape[0]].copy_( + per_layer_inputs) + + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + # NOTE: this order of processing mm items is important + [self.config.image_token_id, self.config.audio_token_id]) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object) -> IntermediateTensors: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE (NickLucche) During profiling, `get_input_embeddings` is not + # called, hence we don't have input_ids to compute PLEs. We simply + # select a chunk of pre-allocated PLEs. During normal execution, + # `get_input_embeddings` is called before forward, hence this slice + # will contain PLEs computed from the actual input_ids. 
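+        # per_layer_embeddings is pre-allocated in __init__ with shape
+        # (max_num_batched_tokens, num_hidden_layers,
+        #  hidden_size_per_layer_input), so this slice picks one per-layer
+        # embedding row per token in the current batch.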
+ per_layer_inputs = self.per_layer_embeddings[:inputs_embeds.shape[0]] + + hidden_states = self.language_model.model( + input_ids, + positions, + per_layer_inputs=per_layer_inputs, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower") + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality == "image": + return "" + elif modality == "audio": + return "" + else: + raise ValueError(f"Unsupported modality: {modality}") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4aa958ecdc..3d8694e7b9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -69,8 +69,7 @@ _TEXT_GENERATION_MODELS = { "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"), - #TODO(ywang96): Support multimodal gemma3n - "Gemma3nForConditionalGeneration": ("gemma3n", "Gemma3nForConditionalGeneration"), # noqa: E501 + "Gemma3nForCausalLM": ("gemma3n", "Gemma3nForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), @@ -205,6 +204,7 @@ _MULTIMODAL_MODELS = { "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 + "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 From fbd8595c5c6f969dfa6cf33e5a371d93d55025fb Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 02:42:21 +0800 Subject: [PATCH 126/932] [Bugfix] Fix basic models tests hanging due to mm processor creation (#22571) Signed-off-by: Isotr0py --- vllm/multimodal/registry.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index a101f2a55f..ded56cca80 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -138,8 +138,8 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return False - processor = self.create_processor(model_config, disable_cache=False) - supported_modalities = processor.info.get_supported_mm_limits() + info = self._create_processing_info(model_config, tokenizer=None) + supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() @@ -278,6 +278,26 @@ class MultiModalRegistry: 
model_cls, _ = get_model_architecture(model_config) return model_cls + def _create_processing_ctx( + self, + model_config: "ModelConfig", + tokenizer: Optional[AnyTokenizer] = None, + ) -> InputProcessingContext: + if tokenizer is None and not model_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(model_config) + return InputProcessingContext(model_config, tokenizer) + + def _create_processing_info( + self, + model_config: "ModelConfig", + *, + tokenizer: Optional[AnyTokenizer] = None, + ) -> BaseProcessingInfo: + model_cls = self._get_model_cls(model_config) + factories = self._processor_factories[model_cls] + ctx = self._create_processing_ctx(model_config, tokenizer) + return factories.info(ctx) + def create_processor( self, model_config: "ModelConfig", @@ -291,15 +311,13 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if tokenizer is None and not model_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: disable_cache = not model_config.enable_mm_processor_cache model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] - ctx = InputProcessingContext(model_config, tokenizer) + ctx = self._create_processing_ctx(model_config, tokenizer) cache = None if disable_cache else self._get_processor_cache( model_config) From 42172ad18fbc22c89d0063184e4570cc84186e16 Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 9 Aug 2025 11:50:03 -0700 Subject: [PATCH 127/932] [FEAT] [Performance] Add triton mrope to replace the torch code path (#22375) Signed-off-by: tjtanaa --- benchmarks/kernels/benchmark_mrope.py | 328 ++++++++++++++++++ tests/kernels/test_mrope.py | 207 +++++++++++ .../layers/rotary_embedding/mrope.py | 231 ++++++++++++ 3 files changed, 766 insertions(+) create mode 100644 benchmarks/kernels/benchmark_mrope.py create mode 100644 tests/kernels/test_mrope.py diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py new file mode 100644 index 0000000000..b914736170 --- /dev/null +++ b/benchmarks/kernels/benchmark_mrope.py @@ -0,0 +1,328 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models). +# It generates test data, runs benchmarks, and saves results to a CSV file. 
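+# Both the PyTorch-native path (forward_native) and the Triton path
+# (forward_cuda) are timed; the "speedup" column is torch_mean / triton_mean.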
+# +# The CSV file (named with current date/time) contains these columns: +# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, +# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, +# speedup +# +# == Usage Examples == +# +# Single model benchmark: +# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \ +# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models benchmark: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different TP sizes: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different token counts: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384 +import csv +import os +import time +from datetime import datetime +from typing import Any + +import numpy as np +import torch + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.utils import FlexibleArgumentParser + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data( + num_tokens: int, + num_q_heads: int, + num_kv_heads: int, + head_size: int, + max_position_embeddings: int, + dtype: torch.dtype, + device: torch.device, +): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint( + 0, max_position_embeddings // 4, (3, num_tokens), device=device + ) + + # Create query and key tensors + query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device) + key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device) + + return positions, query, key + + +def calculate_stats(times: list[float]) -> dict[str, float]: + """Calculate statistics from a list of times.""" + times_array = np.array(times) + return { + "mean": np.mean(times_array), + "median": np.median(times_array), + "p99": np.percentile(times_array, 99), + "min": np.min(times_array), + "max": np.max(times_array), + } + + +def benchmark_mrope( + model_name: str, + num_tokens: int, + head_dim: int, + tp_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 8192, + rope_theta: float = 10000, + is_neox_style: bool = True, + rope_scaling: dict[str, Any] = None, + dtype: torch.dtype = torch.bfloat16, + seed: int = 0, + warmup_iter: int = 10, + benchmark_iter: int = 100, + csv_writer=None, +): + current_platform.seed_everything(seed) + torch.set_default_device(device) + # the parameters to compute the q k v size based on tp_size + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=rope_scaling, + dtype=dtype, + ).to(device=device) + + print(80 * "=") + print( + f"Evaluating model: {model_name} " + f"with tp_size: {tp_size} " + f"and num_tokens: {num_tokens}, " + f"dtype: {dtype}" + ) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = 
generate_test_data( + num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device + ) + + # Warm up + for _ in range(warmup_iter): + mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.cuda.synchronize() + + # Time reference implementation + torch_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + + mrope_helper_class.forward_native( + positions, + query_clone, + key_clone, + ) + + torch.cuda.synchronize() + torch_times.append(time.time() - start_time) + + # Time triton kernel implementation + triton_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + mrope_helper_class.forward_cuda( + positions, + query_clone, + key_clone, + ) + torch.cuda.synchronize() + triton_times.append(time.time() - start_time) + + # Calculate statistics + torch_stats = calculate_stats(torch_times) + triton_stats = calculate_stats(triton_times) + print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):") + + print( + f"Torch implementation: " + f"mean={torch_stats['mean']:.8f}s, " + f"median={torch_stats['median']:.8f}s, " + f"p99={torch_stats['p99']:.8f}s" + ) + + print( + f"Triton implementation: " + f"mean={triton_stats['mean']:.8f}s, " + f"median={triton_stats['median']:.8f}s, " + f"p99={triton_stats['p99']:.8f}s" + ) + + print( + f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x" + ) + + # Write to CSV + if csv_writer: + row = [ + model_name, + tp_size, + num_tokens, + num_heads, + num_kv_heads, + head_dim, + max_position, + rope_theta, + is_neox_style, + str(rope_scaling), + str(dtype).split(".")[-1], + torch_stats["mean"], + torch_stats["median"], + torch_stats["p99"], + torch_stats["min"], + torch_stats["max"], + triton_stats["mean"], + triton_stats["median"], + triton_stats["p99"], + triton_stats["min"], + triton_stats["max"], + torch_stats["mean"] / triton_stats["mean"], # speedup + ] + csv_writer.writerow(row) + + return torch_stats, triton_stats + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." 
+ ) + parser.add_argument("--model-name", type=str, default="") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--warmup-iter", type=int, default=10) + parser.add_argument("--benchmark-iter", type=int, default=100) + parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num-tokens", type=int, nargs="+", required=False) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv") + args = parser.parse_args() + print(args) + + # Create CSV file for results + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv" + + with open(csv_filename, "w", newline="") as csvfile: + csv_writer = csv.writer(csvfile) + # Write header + header = [ + "model_name", + "tp_size", + "num_tokens", + "num_heads", + "num_kv_heads", + "head_dim", + "max_position", + "rope_theta", + "is_neox_style", + "rope_scaling", + "dtype", + "torch_mean", + "torch_median", + "torch_p99", + "torch_min", + "torch_max", + "triton_mean", + "triton_median", + "triton_p99", + "triton_min", + "triton_max", + "speedup", + ] + csv_writer.writerow(header) + + model_tp_dict = {} + if args.model_name == "": + model_tp_dict = { + "Qwen/Qwen2-VL-2B-Instruct": [1], + "Qwen/Qwen2-VL-7B-Instruct": [1], + "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8], + "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8], + } + else: + model_tp_dict[args.model_name] = [args.tp_size] + + if args.num_tokens is None: + num_tokens_list = [2**i for i in range(0, 18)] + else: + num_tokens_list = args.num_tokens + + for model_name, tp_list in model_tp_dict.items(): + config = get_config(model_name, trust_remote_code=args.trust_remote_code) + for tp_size in tp_list: + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + q_size = num_heads * head_dim + kv_size = num_kv_heads * head_dim + is_neox_style = True + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + for num_tokens in num_tokens_list: + benchmark_mrope( + model_name=model_name, + num_tokens=num_tokens, + head_dim=head_dim, + tp_size=tp_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position=max_position, + rope_theta=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=getattr(torch, args.dtype), + seed=args.seed, + warmup_iter=args.warmup_iter, + benchmark_iter=args.benchmark_iter, + csv_writer=csv_writer, + ) + + print(f"Benchmark results saved to {csv_filename}") diff --git a/tests/kernels/test_mrope.py b/tests/kernels/test_mrope.py new file mode 100644 index 0000000000..5918b7a58b --- /dev/null +++ b/tests/kernels/test_mrope.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data(num_tokens: 
int, num_q_heads: int, num_kv_heads: int, + head_size: int, max_position_embeddings: int, + dtype: torch.dtype, device: torch.device): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint(0, + max_position_embeddings // 4, (3, num_tokens), + device=device) + + # Create query and key tensors + query = torch.randn(num_tokens, + num_q_heads * head_size, + dtype=dtype, + device=device) + key = torch.randn(num_tokens, + num_kv_heads * head_size, + dtype=dtype, + device=device) + + return positions, query, key + + +def unroll_model_tp_dict(model_tp_dict): + return [(model_name, tp_size) + for model_name, tp_sizes in model_tp_dict.items() + for tp_size in tp_sizes] + + +model_tp_dict = { + "Qwen/Qwen2-VL-7B-Instruct": [1, 2], + "Qwen/Qwen2-VL-72B-Instruct": [1, 2], + "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2] +} + +# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 +dtype_atol_rtol_list = [ + [torch.bfloat16, 1e-5, 1.6e-2], +] + +num_tokens_list = [11, 8192] + + +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Skipping CUDA/ROCm only tests.") +@pytest.mark.parametrize("model_name, tp_size", + unroll_model_tp_dict(model_tp_dict)) +@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("num_tokens", num_tokens_list) +def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): + + config = AutoConfig.from_pretrained(model_name) + + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + is_neox_style = True + + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=dtype, + ).to(device=device) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = generate_test_data(num_tokens, num_heads, + num_kv_heads, head_dim, + max_position, dtype, device) + + query_native, key_native = mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + query_cuda, key_cuda = mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol) + torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol) + + +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Skipping CUDA/ROCm only tests.") +@pytest.mark.parametrize( + "model_name, tp_size", + unroll_model_tp_dict({"Qwen/Qwen2-VL-7B-Instruct": [1, 2]})) +@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("num_tokens", [4]) +def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, + num_tokens): + config = AutoConfig.from_pretrained(model_name) + + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + is_neox_style = True + rope_theta = config.rope_theta + max_position = 
config.max_position_embeddings + + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=dtype, + ).to(device=device) + + # Generate test data + positions, query, key = generate_test_data(num_tokens, num_heads, + num_kv_heads, head_dim, + max_position, dtype, device) + + # Create a wrapper that makes the in-place function appear functional + def functional_forward_cuda(pos, q, k): + """Wrapper that converts in-place operation to functional style + + CUDA Graph does not support in-place operations. + This wrapper creates working copies of the + input tensors and modifies them. + """ + q_work = q.clone() # Create working copies + k_work = k.clone() + # Your in-place function modifies q_work and k_work + mrope_helper_class.forward_cuda(pos, q_work, k_work) + return q_work, k_work # Return the modified tensors + + # Get reference results + query_native, key_native = mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + try: + compiled_forward_cuda = torch.compile(functional_forward_cuda, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False) + + # Run compiled version + query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda( + positions, + query, + key, + ) + + # Run original version for comparison + query_cuda = query.clone() + key_cuda = key.clone() + mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda) + + # Verify results + torch.testing.assert_close(query_compiled_cuda, + query_cuda, + atol=atol, + rtol=rtol) + torch.testing.assert_close(key_compiled_cuda, + key_cuda, + atol=atol, + rtol=rtol) + torch.testing.assert_close(query_compiled_cuda, + query_native, + atol=atol, + rtol=rtol) + torch.testing.assert_close(key_compiled_cuda, + key_native, + atol=atol, + rtol=rtol) + + print("✓ forward_cuda successfully traced with torch.compile inductor") + + except Exception as e: + pytest.fail( + f"forward_cuda failed to trace with torch.compile inductor: {e}") diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index a75b9e5eb4..d3b71930b6 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -8,10 +8,173 @@ import numpy as np import torch from transformers import PretrainedConfig +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + from .base import RotaryEmbedding from .common import apply_rotary_emb_dispatch +@triton.jit +def _triton_qwen2vl_mrope_forward( + q_ptr, + k_ptr, + cos, + sin, + num_tokens, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + pad_n_qh: tl.constexpr, + pad_n_kh: tl.constexpr, + pad_hd: tl.constexpr, + mrope_section_t: tl.constexpr, + mrope_section_h: tl.constexpr, +): + # Adapted from + # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py + # This version supports flatten input tensors from vllm + # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) + # instead of (3, bsz, seq_len, head_dim) + pid = tl.program_id(0) + # locate start address + q_ptr = q_ptr + pid * (n_qh * hd) + k_ptr = k_ptr + pid * (n_kh * hd) + + # #################################################################### + # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position + # m of this program instance + # 
#################################################################### + # Note: cos and sin now have shape (3, num_tokens, head_dim // 2) + + t_end = mrope_section_t + h_end = t_end + mrope_section_h + + # Updated stride calculation for half head_dim + half_hd = hd // 2 + t_cos = cos + pid * half_hd + h_cos = t_cos + num_tokens * half_hd + w_cos = h_cos + num_tokens * half_hd + t_sin = sin + pid * half_hd + h_sin = t_sin + num_tokens * half_hd + w_sin = h_sin + num_tokens * half_hd + + # Updated offsets for half head_dim + cos_offsets = tl.arange(0, pad_hd // 2) + t_mask = cos_offsets < t_end + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_hd) + + t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) + h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) + w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0) + t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0) + h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0) + w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0) + + cos_row = t_cos_row + h_cos_row + w_cos_row + sin_row = t_sin_row + h_sin_row + w_sin_row + + # #################################################################### + # Load the left and right half of q and k for the current + # program instance (i.e. for the current token) separately + # #################################################################### + # left half of the head + first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange( + 0, pad_hd // 2)[None, :] + first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange( + 0, pad_hd // 2)[None, :] + first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange( + 0, pad_hd // 2)[None, :] < hd // 2) + first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange( + 0, pad_hd // 2)[None, :] < hd // 2) + + q_tile_1 = tl.load(q_ptr + first_half_q_offsets, + mask=first_q_mask, + other=0).to(sin_row.dtype) + k_tile_1 = tl.load(k_ptr + first_half_k_offsets, + mask=first_k_mask, + other=0).to(sin_row.dtype) + + # right half of the head + second_half_q_offsets = first_half_q_offsets + (hd // 2) + second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_q_mask = first_q_mask + second_k_mask = first_k_mask + + q_tile_2 = tl.load(q_ptr + second_half_q_offsets, + mask=second_q_mask, + other=0).to(sin_row.dtype) + k_tile_2 = tl.load(k_ptr + second_half_k_offsets, + mask=second_k_mask, + other=0).to(sin_row.dtype) + + # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] + # Since cos and sin are now half-size, + # we use the same cos_row and sin_row for both halves + new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + + +def triton_mrope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + mrope_section: list[int], + head_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + """Qwen2VL mrope kernel. 
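+
+    The rotation is applied in place on the (contiguous) q and k buffers;
+    non-contiguous inputs are copied by .contiguous() first. The rotated
+    tensors are also returned for convenience.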
+ + Args: + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + cos: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + sin: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + mrope_section: [t, h, w] + head_size: int + """ + n_row, n_q_head_head_dim = q.shape + n_q_head = n_q_head_head_dim // head_size + n_kv_head = k.shape[1] // head_size + pad_hd = triton.next_power_of_2(head_size) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + # ensure tensors passed into the kernel are contiguous. + # It will be no-op if they are already contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + + _triton_qwen2vl_mrope_forward[(n_row, )]( + q, + k, + cos, + sin, + n_row, + n_q_head, + n_kv_head, + head_size, + pad_n_q_head, + pad_n_kv_head, + pad_hd, + mrope_section[0], + mrope_section[1], + ) + return q, k + + class MRotaryEmbedding(RotaryEmbedding): """Rotary Embedding with Multimodal Sections.""" @@ -36,11 +199,34 @@ class MRotaryEmbedding(RotaryEmbedding): if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 + self.use_triton = current_platform.is_cuda_alike() + def forward( self, positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """MRope forward. + + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + if self.use_triton: + return self.forward_cuda(positions, query, key) + else: + return self.forward_native(positions, query, key) + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward(). 
@@ -88,6 +274,51 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + query_shape = query.shape + key_shape = key.shape + if positions.ndim == 2: + assert self.mrope_section + + q, k = triton_mrope( + query, + key, + cos, + sin, + self.mrope_section, + self.head_size, + ) + + return q.reshape(query_shape), k.reshape(key_shape) + + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + @classmethod def get_input_positions( cls, From 61f67d8acdb4b77c168d1150e81a5c284c6f8ce7 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sun, 10 Aug 2025 05:16:11 +0200 Subject: [PATCH 128/932] [V1] [Hybrid] Enable Full CUDA Graph (decode-only) for Mamba layers (#21401) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 60 +++++++++++++++++++ vllm/v1/attention/backends/mamba_attn.py | 44 +++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 4934da9517..76f6c226ba 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -384,3 +384,63 @@ def test_distributed_correctness( name_0="vllm_tp_1", name_1="vllm_tp_2", ) + + +@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_full_cuda_graph( + hf_runner, + vllm_runner, + example_prompts, + monkeypatch, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + + try: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + except ValueError: + pass + + with hf_runner(model) as hf_model: + if model not in HF_UNSUPPORTED_MODELS: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + else: + hf_outputs = None + + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + if model in HYBRID_MODELS: + # required due to reorder_batch behaviour + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + compilation_config={'full_cuda_graph': True}, + enable_prefix_caching=False) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + 
example_prompts, max_tokens, num_logprobs) + + if hf_outputs is not None: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_v0_outputs, + name_0="hf", + name_1="vllm-v0", + ) + + ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs + check_logprobs_close( + outputs_0_lst=ref_outputs, + outputs_1_lst=vllm_v1_outputs, + name_0="hf" if hf_outputs is not None else "vllm-v0", + name_1="vllm-v1", + ) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 66a8d91db8..7c1226049f 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -7,8 +7,10 @@ from typing import ClassVar, Optional import torch from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec @@ -82,6 +84,8 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba2AttentionMetadata]): + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY reorder_batch_threshold: ClassVar[int] = 1 @@ -90,8 +94,18 @@ class Mamba2AttentionMetadataBuilder( assert isinstance(kv_cache_spec, MambaSpec) self.kv_cache_spec = kv_cache_spec self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) def build(self, common_prefix_len: int, @@ -144,6 +158,14 @@ class Mamba2AttentionMetadataBuilder( query_start_loc_p, self.chunk_size, num_prefill_tokens)) + elif num_decodes <= self.decode_cudagraph_max_bs: + # Pad state tensor for CUDA graph + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_(state_indices_tensor, + non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:num_input_tokens] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID + attn_metadata = Mamba2AttentionMetadata( num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, @@ -160,3 +182,23 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor=state_indices_tensor, ) return attn_metadata + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." 
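As a side note, the decode-only padding performed in build() above can be read in isolation: the live state indices are copied into a persistent buffer sized for the captured graph, and the padded tail is marked invalid so captured kernels skip those rows. A minimal sketch under the assumption that PAD_SLOT_ID is the usual -1 sentinel; the helper below is illustrative, not part of this patch:

import torch

PAD_SLOT_ID = -1  # assumed sentinel value, matching vllm.attention.backends.utils

def pad_state_indices_for_capture(state_indices, persistent_buf, padded_batch_size):
    # Copy the real decode entries into the buffer the CUDA graph was captured
    # against, then mark the padded tail as invalid.
    num_decodes = state_indices.shape[0]
    persistent_buf[:num_decodes].copy_(state_indices, non_blocking=True)
    padded = persistent_buf[:padded_batch_size]
    padded[num_decodes:] = PAD_SLOT_ID
    return padded

buf = torch.empty(8, dtype=torch.int32)
idx = torch.tensor([5, 9, 3], dtype=torch.int32)
print(pad_state_indices_for_capture(idx, buf, padded_batch_size=4))
# tensor([ 5,  9,  3, -1], dtype=torch.int32)
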
+ + m.max_query_len = 1 # decode-only + + return self.build(0, m) + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + return common_attn_metadata.max_query_len == 1 From 0c5254b82acc625112ce7adc10811514f1a42d52 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 10 Aug 2025 11:19:13 +0800 Subject: [PATCH 129/932] [oss] Init gpt-oss bf16 support (#22508) Signed-off-by: Jee Jee Li --- .../model_executor/layers/fused_moe/config.py | 6 +- .../layers/fused_moe/fused_moe.py | 265 +++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 40 ++- vllm/model_executor/models/gpt_oss.py | 152 +++++++++- 4 files changed, 340 insertions(+), 123 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index f2242ade0c..31ea826f1f 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -324,6 +324,8 @@ class FusedMoEConfig: max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE + has_bias: bool = False + def __post_init__(self): if self.dp_size > 1: logger.debug_once("Using FusedMoEConfig::max_num_tokens=%d", @@ -413,7 +415,8 @@ class FusedMoEConfig: in_dtype: torch.dtype, max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE, quant_config: Optional[Union[FusedMoEQuantConfig, - QuantizationConfig]] = None + QuantizationConfig]] = None, + has_bias: bool = False, ) -> "FusedMoEConfig": _quant_config: Optional[FusedMoEQuantConfig] = None @@ -482,4 +485,5 @@ class FusedMoEConfig: in_dtype=in_dtype, quant_config=_quant_config, max_num_tokens=max_num_tokens, + has_bias=has_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f4f5457ebc..3ad5f5b7ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -275,6 +275,7 @@ def fused_moe_kernel( a_ptr, b_ptr, c_ptr, + b_bias_ptr, a_scale_ptr, b_scale_ptr, topk_weights_ptr, @@ -302,6 +303,8 @@ def fused_moe_kernel( stride_bse, stride_bsk, stride_bsn, + stride_bbe, # bias expert stride + stride_bbn, # bias N stride # Block size for block-wise quantization group_n: tl.constexpr, group_k: tl.constexpr, @@ -317,6 +320,7 @@ def fused_moe_kernel( use_int8_w8a8: tl.constexpr, use_int8_w8a16: tl.constexpr, per_channel_quant: tl.constexpr, + HAS_BIAS: tl.constexpr, ): """ Implements the fused computation for a Mixture of Experts (MOE) using @@ -414,7 +418,10 @@ def fused_moe_kernel( else: a_scale = tl.load(a_scale_ptr) b_scale = tl.load(b_scale_ptr + off_experts) - + if HAS_BIAS: + # bias shape: [num_experts, N] + bias_ptrs = b_bias_ptr + off_experts * stride_bbe + offs_bn * stride_bbn + bias = tl.load(bias_ptrs, mask=(offs_bn < N), other=0.0) # ----------------------------------------------------------- # Iterate to compute a block of the C matrix. # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block @@ -456,7 +463,8 @@ def fused_moe_kernel( # Advance the ptrs to the next K block. 
a_ptrs += BLOCK_SIZE_K * stride_ak b_ptrs += BLOCK_SIZE_K * stride_bk - + if HAS_BIAS: + accumulator = accumulator + bias[None, :] if MUL_ROUTED_WEIGHT: moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, @@ -471,6 +479,7 @@ def fused_moe_kernel( accumulator = (accumulator * a_scale * b_scale).to(compute_type) else: accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) @@ -499,7 +508,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a16: bool, use_int4_w4a16: bool, per_channel_quant: bool, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[list[int]] = None, + B_bias: Optional[torch.Tensor] = None) -> None: assert topk_weights is not None or not mul_routed_weight assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -531,7 +541,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A.size(0) * top_k * config['BLOCK_SIZE_M']) grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( B.size(1), META['BLOCK_SIZE_N']), ) - + HAS_BIAS = B_bias is not None if (use_int8_w8a16 or use_int4_w4a16) and \ block_shape is not None and block_shape[1] > 0: assert B_scale is not None and B_scale.ndim == 3 @@ -611,6 +621,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A, B, C, + B_bias, A_scale, B_scale, topk_weights, @@ -638,6 +649,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, if B_scale is not None and B_scale.ndim == 3 else 0, B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_bias.stride(0) if B_bias is not None else 0, + B_bias.stride(1) if B_bias is not None else 0, 0 if block_shape is None else block_shape[0], 0 if block_shape is None else block_shape[1], MUL_ROUTED_WEIGHT=mul_routed_weight, @@ -647,6 +660,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, per_channel_quant=per_channel_quant, + HAS_BIAS=HAS_BIAS, BLOCK_SIZE_K=BLOCK_SIZE_K, **config, ) @@ -1024,40 +1038,43 @@ def inplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: #noqa: UP006 + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: #noqa: UP006 fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, - a2_scale, block_shape) + a2_scale, block_shape, w1_bias, w2_bias) -def inplace_fused_experts_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: 
Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: +def inplace_fused_experts_fake(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: pass @@ -1246,36 +1263,38 @@ direct_register_custom_op( def outplace_fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, #noqa: UP006 + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: return fused_experts_impl( hidden_states, w1, w2, topk_weights, topk_ids, False, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, - w1_zp, w2_zp, a1_scale, a2_scale, block_shape) + w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias) def outplace_fused_experts_fake( @@ -1300,7 +1319,9 @@ def outplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> 
torch.Tensor: + block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1332,33 +1353,34 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: # TODO (bnell): replace this with modular op. Can get rid of inplace/outplace # torch ops. -def fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - allow_deep_gemm: bool = False, - allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor: +def fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + allow_deep_gemm: bool = False, + allow_cutlass_block_scaled_grouped_gemm: bool = False, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. # However, on B200, we use DeepGemm for all cases because they only support @@ -1423,7 +1445,10 @@ def fused_experts( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias, + ) def fused_experts_impl( @@ -1451,6 +1476,8 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: # Check constraints. 
if use_int4_w4a16: @@ -1591,7 +1618,19 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w1_bias) + + # TODO fused kernel + def swiglu_oai(gate_up): + alpha = 1.702 + limit = 7.0 + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output # Activation function with multiplication if activation == "silu" and is_act_and_mul: @@ -1605,6 +1644,8 @@ def fused_experts_impl( intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) + elif activation == "swiglu_oai": + intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") @@ -1635,7 +1676,8 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w2_bias) ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()), out_hidden_states[begin_chunk_idx:end_chunk_idx]) @@ -1672,6 +1714,8 @@ def fused_moe( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1766,7 +1810,9 @@ def fused_moe( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias) class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -1937,7 +1983,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, per_channel_quant=self.per_act_token_quant, - block_shape=self.block_shape) + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) self.activation(activation, intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -1948,26 +1996,29 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache2, a2_scale, self.quant_dtype, self.per_act_token_quant, self.block_shape) - invoke_fused_moe_kernel(qintermediate_cache2, - w2, - intermediate_cache3, - a2q_scale, - w2_scale, - w2_zp, - topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - not apply_router_weight_on_input, - 1, - config, - compute_type=compute_type, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a8=self.use_int8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - per_channel_quant=self.per_act_token_quant, - block_shape=self.block_shape) + invoke_fused_moe_kernel( + qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + not apply_router_weight_on_input, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_act_token_quant, + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) 
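For reference, the "swiglu_oai" activation introduced in fused_experts_impl above differs from the standard silu-and-mul in two ways: the gate and up projections are interleaved column-wise instead of concatenated, and both halves are clamped before the gated product. A small self-contained sketch of the same computation and its shape contract (illustrative only; 1.702 and 7.0 are the values hard-coded in the diff):

import torch

def swiglu_oai_reference(gate_up, alpha=1.702, limit=7.0):
    # Even columns are the gate, odd columns are the up projection.
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
    gate = gate.clamp(max=limit)           # gate is clamped from above only
    up = up.clamp(min=-limit, max=limit)   # up is clamped on both sides
    return (up + 1) * gate * torch.sigmoid(gate * alpha)

x = torch.randn(16, 2 * 1024)   # [num_tokens, 2 * intermediate_size]
y = swiglu_oai_reference(x)
assert y.shape == (16, 1024)    # the activation halves the hidden dimension
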
ops.moe_sum(intermediate_cache3, output) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d664a92841..d5a89655e3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -255,7 +255,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self.fused_experts = fused_experts # type: ignore self.topk_indices_dtype = None self.moe = moe - + self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: from .rocm_aiter_fused_moe import rocm_aiter_fused_experts @@ -291,7 +291,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - + if self.has_bias: + w13_bias = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) # down_proj (row parallel) w2_weight = torch.nn.Parameter(torch.empty( num_experts, @@ -301,6 +308,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + if self.has_bias: + w2_bias = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor: # Pad the weight tensor. This is an optimization on ROCm platform, which @@ -465,6 +479,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, + w1_bias=layer.w13_bias if self.has_bias else None, + w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -702,6 +718,7 @@ class FusedMoE(torch.nn.Module): activation: str = "silu", enable_eplb: bool = False, num_redundant_experts: int = 0, + has_bias: bool = False, ): super().__init__() if params_dtype is None: @@ -793,16 +810,15 @@ class FusedMoE(torch.nn.Module): # since model_config is not set in the pytest test. 
model_dtype = params_dtype - moe = FusedMoEConfig.make( - num_experts=self.global_num_experts, - experts_per_token=top_k, - hidden_dim=hidden_size, - num_local_experts=self.local_num_experts, - moe_parallel_config=self.moe_parallel_config, - in_dtype=model_dtype, - max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - quant_config=quant_config, - ) + moe = FusedMoEConfig.make(num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + in_dtype=model_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + quant_config=quant_config, + has_bias=has_bias) self.moe_config = moe self.quant_config = quant_config diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index feb323a045..6a65bbbe2e 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -160,7 +160,9 @@ class MLPBlock(torch.nn.Module): renormalize=True, quant_config=quant_config, prefix=f"{prefix}.experts", - apply_router_weight_on_input=False) + apply_router_weight_on_input=False, + has_bias=True, + activation="swiglu_oai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) @@ -262,8 +264,8 @@ class GptOssForCausalLM(nn.Module): sampling_metadata) return logits - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def _load_weights_mxfp4( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: rename_mapping = { "self_attn": "attn", "input_layernorm.weight": "attn.norm.weight", @@ -469,3 +471,147 @@ class GptOssForCausalLM(nn.Module): loaded_params.add(renamed_name) return loaded_params + + def _load_weights_other( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + if ".experts.gate_up_proj" in name and "bias" not in name: + # Handle MLP gate and up projection weights + new_name = name.replace(".experts.gate_up_proj", + ".experts.w13_weight") + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif ".experts.down_proj" in name and "bias" not in name: + # Handle MLP down projection weights + new_name = name.replace(".experts.down_proj", + ".experts.w2_weight") + + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[new_name] + param.copy_(weight) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + quant_method = (self.model_config.quantization_config['quant_method'] + if hasattr(self.model_config, "quantization_config") + else None) + if quant_method == "mxfp4": + return self._load_weights_mxfp4(weights) + else: + return self._load_weights_other(weights) From 3d7363e61c0a27bcba9e6694ae9771f9b780ce3d Mon Sep 17 00:00:00 2001 From: Le Chen Date: Sun, 10 Aug 2025 11:21:05 +0800 Subject: [PATCH 130/932] [Config] add "qwen" as a native eagle3 target supported model (#22333) Signed-off-by: lechen Signed-off-by: LeChen --- tests/models/registry.py | 4 +++ tests/v1/e2e/test_spec_decode.py | 39 +++++++++++++----------- vllm/config/__init__.py | 8 +---- vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/eagle.py | 5 +-- 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index e0939d1a20..898e38a4ae 100644 --- a/tests/models/registry.py +++ 
b/tests/models/registry.py @@ -525,6 +525,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), + "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + trust_remote_code=True, + speculative_model="AngelSlim/Qwen3-8B_eagle3", + tokenizer="Qwen/Qwen3-8B"), "EagleLlama4ForCausalLM": _HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 4950faf826..cd383b58db 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -125,24 +125,27 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize( - ["model_setup", "mm_enabled"], [ - (("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - ], - ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"]) +@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ + (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), +], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", + "llama4_eagle", "llama4_eagle_mm" + ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 7efab23f14..b2826de93d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2852,13 +2852,7 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - from vllm.transformers_utils.configs import SpeculatorsConfig - - eagle3_target_supported = ["llama"] - if self.draft_model_config and isinstance( - self.draft_model_config.hf_config, SpeculatorsConfig): - eagle3_target_supported.append("qwen") - + eagle3_target_supported = ["llama", "qwen"] if self.method == "eagle3" and self.target_model_config and not any( supported_model in self.target_model_config.hf_text_config.model_type diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3d8694e7b9..aca3d84f00 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -259,6 +259,7 @@ _SPECULATIVE_DECODING_MODELS = { "EagleLlama4ForCausalLM": ("llama4_eagle", 
"EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 5445a333c4..01217eb191 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -45,6 +45,7 @@ class EAGLEConfig(PretrainedConfig): # Eagle model name should follow naming convention of # LlamaForCausalLM -> EagleLlamaForCausalLM + # LlamaForCausalLM -> Eagle3LlamaForCausalLM / LlamaForCausalLMEagle3 if method == "eagle": assert self.model is not None, \ "model should not be None when method is eagle" @@ -56,8 +57,8 @@ class EAGLEConfig(PretrainedConfig): assert self.model is not None, \ "model should not be None when method is eagle3" kwargs["architectures"] = [ - f"Eagle3{arch}" if not arch.startswith("Eagle3") \ - else arch for arch in self.model.architectures + arch if arch.startswith("Eagle3") or arch.endswith("Eagle3") + else f"Eagle3{arch}" for arch in self.model.architectures ] else: raise ValueError(f"Invalid method {method}. \ From 534c45b9620d4d97cf2ea2cdee77e8461844a243 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sun, 10 Aug 2025 11:25:42 +0800 Subject: [PATCH 131/932] Improve fast_topk function with type hints and documentation (#22530) Signed-off-by: zitian.zhao --- vllm/model_executor/models/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c69df6e616..6c27fedc61 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -736,7 +736,23 @@ def cast_overflow_tensors( return tensors -def fast_topk(values, topk, dim): +def fast_topk(values: torch.Tensor, topk: int, + dim: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Optimized topk implementation that uses torch.max for k=1 case. + + This function provides better performance for the common case of k=1 + by using torch.max instead of the more general torch.topk. + + Args: + values: Input tensor to find top-k values from + topk: Number of top values to return (k). Must be > 0. 
+ dim: Dimension along which to compute topk + + Returns: + Tuple of (values, indices) where values are the top-k values + and indices are their corresponding indices in the input tensor + """ if topk == 1: # Use max along the specified dimension to get both value and index return torch.max(values, dim=dim, keepdim=True) From 2a84fb422fc62ab29238dccbf7bdb214fc51c31e Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 9 Aug 2025 20:49:04 -0700 Subject: [PATCH 132/932] [TPU] kv cache update kernel doesn't need to be padded slices to multiple of num_slices_per_block (#22394) Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao --- tests/v1/tpu/test_kv_cache_update_kernel.py | 5 ----- vllm/attention/ops/pallas_kv_cache_update.py | 16 ++++++++++------ vllm/v1/worker/tpu_model_runner.py | 19 +++++++++---------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/tests/v1/tpu/test_kv_cache_update_kernel.py b/tests/v1/tpu/test_kv_cache_update_kernel.py index f82737325e..acb607247d 100644 --- a/tests/v1/tpu/test_kv_cache_update_kernel.py +++ b/tests/v1/tpu/test_kv_cache_update_kernel.py @@ -43,11 +43,6 @@ def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int, np.cumsum(slice_lens[:-1])]) slot_mapping = np.stack( [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1) - padded_size = (slot_mapping.shape[0] + num_slices_per_block - - 1) // num_slices_per_block * num_slices_per_block - slot_mapping = np.pad(slot_mapping, - [[0, padded_size - slot_mapping.shape[0]], [0, 0]], - constant_values=0) slot_mapping = np.transpose(slot_mapping) slot_mapping_cpu = torch.tensor(slot_mapping, device="cpu", diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/attention/ops/pallas_kv_cache_update.py index e7d727a45e..d75983bd40 100644 --- a/vllm/attention/ops/pallas_kv_cache_update.py +++ b/vllm/attention/ops/pallas_kv_cache_update.py @@ -14,6 +14,7 @@ def _kv_cache_update_kernel( # Prefetch slices_ref, # [3, padded_num_slices], list of (kv_cache_start, # new_kv_start, slice_len) + num_slices_ref, # [1] # Input new_kv_hbm_ref, # [num_tokens, num_combined_kv_heads, head_dim] kv_cache_hbm_ref, # [total_num_pages * page_size, num_combined_kv_heads, @@ -32,8 +33,10 @@ def _kv_cache_update_kernel( # Copy from new_kv_hbm_ref to scratch for i in range(num_slices_per_block): offset_i = i + block_idx * num_slices_per_block - new_kv_start = slices_ref[1, offset_i] - length = slices_ref[2, offset_i] + new_kv_start = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[1, offset_i], 0) + length = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[2, offset_i], 0) async_copy = pltpu.make_async_copy( new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...], scratch.at[i, pl.ds(0, length), ...], @@ -49,8 +52,10 @@ def _kv_cache_update_kernel( async_copies.clear() for i in range(num_slices_per_block): offset_i = i + block_idx * num_slices_per_block - kv_cache_start = slices_ref[0, offset_i] - length = slices_ref[2, offset_i] + kv_cache_start = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[0, offset_i], 0) + length = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[2, offset_i], 0) async_copy = pltpu.make_async_copy( scratch.at[i, pl.ds(0, length), ...], kv_cache_hbm_ref.at[pl.ds(kv_cache_start, length), ...], @@ -77,7 +82,6 @@ def kv_cache_update( page_size: int = 32, num_slices_per_block: int = 8, ): - assert slices.shape[1] % num_slices_per_block == 0 _, num_combined_kv_heads, head_dim = new_kv.shape assert kv_cache.shape[1] == 
num_combined_kv_heads assert kv_cache.shape[2] == head_dim @@ -93,7 +97,7 @@ def kv_cache_update( out_specs = [pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY)] out_shape = [jax.ShapeDtypeStruct(kv_cache.shape, dtype=kv_cache.dtype)] - scalar_prefetches = [slices] + scalar_prefetches = [slices, num_kv_update_slices] scratch = pltpu.VMEM( (num_slices_per_block, page_size, num_combined_kv_heads, head_dim), new_kv.dtype, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 442c0ea068..915869726f 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -745,7 +745,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_kv_update_slices = slot_mapping_metadata.shape[0] padded_num_slices = _get_padded_num_kv_cache_update_slices( padded_total_num_scheduled_tokens, self.max_num_reqs, - self.block_size, self._num_slices_per_kv_cache_update_block) + self.block_size) slot_mapping_metadata = np.pad( slot_mapping_metadata, [[0, padded_num_slices - len(slot_mapping_metadata)], [0, 0]], @@ -1244,8 +1244,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): position_ids = torch.zeros(num_tokens, dtype=torch.int32).to(self.device) padded_num_slices = _get_padded_num_kv_cache_update_slices( - num_tokens, self.max_num_reqs, self.block_size, - self._num_slices_per_kv_cache_update_block) + num_tokens, self.max_num_reqs, self.block_size) num_kv_update_slices = torch.tensor([padded_num_slices], dtype=torch.int32).to(self.device) slot_mapping = torch.zeros((3, padded_num_slices), @@ -1963,17 +1962,17 @@ def copy_kv_blocks( _copy_fn(src_tensor, dst_tensor, src_indices, dst_indices) -def _get_padded_num_kv_cache_update_slices( - num_tokens: int, max_num_reqs: int, page_size: int, - num_slices_per_kv_cache_update_block: int) -> int: +def _get_padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int, + page_size: int) -> int: """Calculates the padded number of KV cache update slices to avoid recompilation.""" + # NOTE(chengjiyao): let's say R_i is the token num for i-th request, + # so it occupies most 2 + R_i // page_size pages. 
The total maximum + # possible number of pages needed is sum(2 + R_i // page_size), which + # is <= 2 * max_num_reqs + sum(R_i) // page_size + # = 2 * max_num_reqs + num_tokens // page_size padded_num_slices = 2 * max_num_reqs + num_tokens // page_size padded_num_slices = min(padded_num_slices, num_tokens) - padded_num_slices = ( - padded_num_slices + num_slices_per_kv_cache_update_block - 1 - ) // num_slices_per_kv_cache_update_block * \ - num_slices_per_kv_cache_update_block return padded_num_slices From c49848396d34a1059fbec2a197394484acf5a903 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 04:50:48 +0100 Subject: [PATCH 133/932] Refactor sliding window configuration to Transformers best practice (#21927) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/model/basic.md | 2 +- tests/test_config.py | 22 ---- vllm/config/__init__.py | 111 ++++++--------------- vllm/engine/arg_utils.py | 10 +- vllm/model_executor/models/commandr.py | 20 ++-- vllm/model_executor/models/exaone4.py | 21 +--- vllm/model_executor/models/gemma2.py | 9 +- vllm/model_executor/models/gemma3.py | 14 +-- vllm/model_executor/models/gemma3_mm.py | 6 +- vllm/model_executor/models/gemma3n.py | 13 ++- vllm/model_executor/models/gritlm.py | 4 +- vllm/model_executor/models/llama.py | 17 +--- vllm/model_executor/models/phi4flash.py | 9 +- vllm/model_executor/models/qwen2.py | 4 +- vllm/model_executor/models/transformers.py | 52 ++-------- vllm/transformers_utils/config.py | 40 ++++++++ 16 files changed, 123 insertions(+), 231 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index edd9a47e13..21b1f21d60 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -117,7 +117,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m To support a model with interleaving sliding windows, we need to take care of the following details: -- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model. +- Make sure the model's `config.json` contains `layer_types`. - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. diff --git a/tests/test_config.py b/tests/test_config.py index 441c07b99a..19b1b74e42 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -200,28 +200,6 @@ def test_disable_sliding_window(model_id_expected): assert model_config.max_model_len == expected -def test_get_sliding_window(): - TEST_SLIDING_WINDOW = 4096 - # Test that the sliding window is correctly computed. - # For Qwen1.5/Qwen2, get_sliding_window() should be None - # when use_sliding_window is False. 
- qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B") - - qwen2_model_config.hf_config.use_sliding_window = False - qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW - assert qwen2_model_config.get_sliding_window() is None - - qwen2_model_config.hf_config.use_sliding_window = True - assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW - - mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1") - mistral_model_config.hf_config.sliding_window = None - assert mistral_model_config.get_sliding_window() is None - - mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW - assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW - - @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config(): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index b2826de93d..49da3fd848 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -40,8 +40,9 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - maybe_override_with_speculators_target_model, try_get_generation_config, - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) + is_interleaved, maybe_override_with_speculators_target_model, + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf conflicts with isort for this block @@ -714,53 +715,31 @@ class ModelConfig: revision=self.revision, ) - # Workaround for Gemma 2 which uses interleaved sliding window - # attention, but it's not specified in its config. - # TODO: remove this when Gemma 2 config updated in HuggingFace. - if self.hf_text_config.model_type == "gemma2": - self.hf_text_config.sliding_window_pattern = 2 - - # TODO: remove this when Gemma 3n config updated in HuggingFace. - if self.hf_text_config.model_type == "gemma3n_text": - # 4 sliding window attention followed by 1 full attention - self.hf_text_config.sliding_window_pattern = "LLLLG" - - sliding_window = getattr(self.hf_text_config, "sliding_window", None) - sliding_window_pattern = getattr(self.hf_text_config, - "sliding_window_pattern", None) - has_interleaved_attention = sliding_window_pattern is not None or ( - isinstance(sliding_window, list)) - - if not self.disable_sliding_window and has_interleaved_attention: - if not envs.VLLM_USE_V1 and (backend := envs.VLLM_ATTENTION_BACKEND - ) in ("XFORMERS", "FLASHINFER"): - sliding_window_len_min = get_min_sliding_window( - self.hf_text_config.sliding_window) - - logger.warning_once( - "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501 - self.hf_text_config.model_type, - backend, - sliding_window_len_min, - ) - self.disable_sliding_window = True - else: - # for a model with interleaved attention, - # the scheduler and the model treat it as full attention - # (i.e., not dropping any tokens outside the window). - # only the attention layer itself is aware of the sliding - # window, and use the window size to compute the attention. 
- self.hf_text_config.interleaved_sliding_window = sliding_window - - if hasattr(self.hf_text_config, "sliding_window"): - delattr(self.hf_text_config, "sliding_window") - - sliding_window = None + # Interleaved attention is not supported by some backends in V0 + if (not self.disable_sliding_window + and is_interleaved(self.hf_text_config) + and not envs.VLLM_USE_V1 + and (backend := envs.VLLM_ATTENTION_BACKEND) + in ("XFORMERS", "FLASHINFER")): + logger.warning_once( + "%s has interleaved attention, which is currently not " + "supported by the %s backend. Disabling sliding window and " + "capping the max length to the sliding window size (%d).", + self.hf_text_config.model_type, + backend, + self.hf_text_config.sliding_window, + ) + self.disable_sliding_window = True self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.multimodal_config = self._init_multimodal_config() + if self.disable_sliding_window: + # Set after get_and_verify_max_len to ensure that max_model_len + # can be correctly capped to sliding window size + self.hf_text_config.sliding_window = None + if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -1322,27 +1301,10 @@ class ModelConfig: if self.use_async_output_proc: self.use_async_output_proc = False - def get_hf_config_sliding_window( - self) -> Union[Optional[int], list[Optional[int]]]: - """Get the sliding window size, or None if disabled.""" - - # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in - # addition to sliding window size. We check if that field is present - # and if it's False, return None. - if (hasattr(self.hf_text_config, "use_sliding_window") - and not self.hf_text_config.use_sliding_window): - return None + def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: - """Get the sliding window size, or None if disabled. - """ - # If user disables sliding window, return None. - if self.disable_sliding_window: - return None - # Otherwise get the value from the hf config. - return self.get_hf_config_sliding_window() - def get_vocab_size(self) -> int: return getattr(self.hf_text_config, "vocab_size", 0) @@ -1762,7 +1724,7 @@ class ModelConfig: tokenizer_config=tokenizer_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window(), + sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config) logger.info("Using max model len %s", max_model_len) @@ -3305,7 +3267,7 @@ def _get_and_verify_max_len( tokenizer_config: Optional[dict], max_model_len: Optional[int], disable_sliding_window: bool, - sliding_window_len: Optional[Union[int, list[Optional[int]]]], + sliding_window: Optional[int], spec_target_max_model_len: Optional[int] = None, encoder_config: Optional[Any] = None, ) -> int: @@ -3344,13 +3306,10 @@ def _get_and_verify_max_len( # If sliding window is manually disabled, max_length should be less # than the sliding window length in the model config. 
- if disable_sliding_window and sliding_window_len is not None: - - sliding_window_len_min = get_min_sliding_window(sliding_window_len) - max_len_key = "sliding_window" \ - if sliding_window_len_min < derived_max_model_len else max_len_key - derived_max_model_len = min(derived_max_model_len, - sliding_window_len_min) + if (disable_sliding_window and sliding_window is not None + and sliding_window < derived_max_model_len): + max_len_key = "sliding_window" + derived_max_model_len = sliding_window # Consider model_max_length in tokenizer_config if tokenizer_config: @@ -3451,14 +3410,6 @@ def _get_and_verify_max_len( return int(max_model_len) -def get_min_sliding_window( - sliding_window: Union[int, list[Optional[int]]]) -> int: - if isinstance(sliding_window, list): - return min(s for s in sliding_window if s is not None) - - return sliding_window - - def get_served_model_name(model: str, served_model_name: Optional[Union[str, list[str]]]): """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4d4ce4c78e..4767201617 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,6 +39,7 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 +from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -1081,6 +1082,13 @@ class EngineArgs: "DualChunkFlashAttention is not supported on V1 engine. " "To run the model in V0 engine, try set 'VLLM_USE_V1=0'") + sliding_window: Optional[int] = None + if not is_interleaved(model_config.hf_text_config): + # Only set CacheConfig.sliding_window if the model is all sliding + # window. Otherwise CacheConfig.sliding_window will override the + # global layers in interleaved sliding window models. 
+ sliding_window = model_config.get_sliding_window() + cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, @@ -1088,7 +1096,7 @@ class EngineArgs: cache_dtype=self.kv_cache_dtype, is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, - sliding_window=model_config.get_sliding_window(), + sliding_window=sliding_window, enable_prefix_caching=self.enable_prefix_caching, prefix_caching_hash_algo=self.prefix_caching_hash_algo, cpu_offload_gb=self.cpu_offload_gb, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 69281abf73..4dd84b8f8f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -182,21 +182,13 @@ class CohereAttention(nn.Module): ) # Model v2 has interleaved sliding windows, v1 does not - interleaved_sliding_window = getattr(config, - "interleaved_sliding_window", - None) - self.v1 = interleaved_sliding_window is None + self.v1 = isinstance(config, CohereConfig) - layer_idx = extract_layer_index(prefix) - layer_has_sliding_window = ( - getattr(config, "sliding_window_pattern", False) and - (layer_idx + 1) % self.config.sliding_window_pattern - != 0) or (getattr(config, "layer_types", False) - and config.layer_types[layer_idx] == "sliding_attention") - - self.sliding_window = (interleaved_sliding_window - or config.sliding_window - if layer_has_sliding_window else None) + self.sliding_window = None + if not self.v1: + layer_idx = extract_layer_index(prefix) + if config.layer_types[layer_idx] == "sliding_attention": + self.sliding_window = config.sliding_window self.attn = Attention(self.num_heads, self.head_dim, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index ecd942a76a..827e901418 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -159,25 +159,12 @@ class Exaone4Attention(nn.Module): if quant_config is not None and quant_config.get_name() == "gguf": is_neox_style = False - self.apply_all_layers = False # apply rotary embeddings to every layer. 
layer_idx = extract_layer_index(prefix) - interleaved_sliding_window = getattr(config, - "interleaved_sliding_window", - 4096) - sliding_window_pattern = getattr(config, "sliding_window_pattern", - "LLLG") + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if is_sliding else None - if sliding_window_pattern: - layer_has_sliding_window = ( - layer_idx + 1) % sliding_window_pattern.__len__() != 0 - else: - layer_has_sliding_window = False - self.apply_all_layers = True - - if layer_has_sliding_window: - self.sliding_window = interleaved_sliding_window - else: - self.sliding_window = None + # apply rotary embeddings to every layer + self.apply_all_layers = not is_sliding self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 8beefb2cd0..8cfe92c645 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -144,13 +144,10 @@ class Gemma2Attention(nn.Module): is_neox_style=True, ) - # reference: - # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa layer_idx = extract_layer_index(prefix) - use_sliding_window = (layer_idx % 2 == 0 and getattr( - config, "interleaved_sliding_window", None) is not None) - sliding_window = config.interleaved_sliding_window if \ - use_sliding_window else None + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if is_sliding else None + self.attn = Attention(self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1a2ce65d1e..b762be3c52 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -146,25 +146,19 @@ class Gemma3Attention(nn.Module): self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) - # TODO(woosuk): Add reference to the original HF implementation. layer_idx = extract_layer_index(prefix) - self.is_sliding = (getattr( - config, "interleaved_sliding_window", None) is not None and (bool( - (layer_idx + 1) % config.sliding_window_pattern))) or ( - getattr(config, "layer_types", None) is not None - and config.layer_types[layer_idx] == "sliding_attention") + self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if self.is_sliding else None + # Initialize the rotary embedding. if self.is_sliding: # Local attention. Override the values in config.json. self.rope_theta = config.rope_local_base_freq self.rope_scaling = {"rope_type": "default"} - self.sliding_window = (config.interleaved_sliding_window - or config.sliding_window) else: # Global attention. Use the values in config.json. 
self.rope_theta = config.rope_theta self.rope_scaling = config.rope_scaling - self.sliding_window = None self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, @@ -182,7 +176,7 @@ class Gemma3Attention(nn.Module): cache_config=cache_config, quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, - per_layer_sliding_window=self.sliding_window, + per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn") def forward( diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e9ee1ebdcc..9871b11b37 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -502,8 +502,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.sliding_window = getattr(config.text_config, - "interleaved_sliding_window", None) self.vision_tower = SiglipVisionModel(config.vision_config, quant_config, @@ -690,11 +688,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) global_attn_masks.append(global_attn_mask) - if self.sliding_window is not None: + if (sliding_window := self.config.sliding_window) is not None: # Create a local causal mask with sliding window (1024). local_attn_mask = torch.ones_like(global_attn_mask) local_attn_mask = torch.tril(local_attn_mask, - diagonal=-self.sliding_window) + diagonal=-sliding_window) local_attn_mask = torch.where(local_attn_mask == 0, global_attn_mask, float("-inf")) local_attn_masks.append(local_attn_mask) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 4b41cba1c7..ffec340870 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -313,17 +313,16 @@ class Gemma3nAttention(nn.Module): has_weight=False) layer_idx = extract_layer_index(prefix) + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if is_sliding else None - is_sliding_window = ( - getattr(config, "interleaved_sliding_window", None) is not None - and config.layer_types[layer_idx] == "sliding_attention") - - if is_sliding_window: - self.sliding_window = config.interleaved_sliding_window + # Initialize the rotary embedding. + if is_sliding: + # Local attention. Override the values in config.json. rope_theta = config.rope_local_base_freq rope_scaling = {"rope_type": "default"} else: - self.sliding_window = None + # Global attention. Use the values in config.json. 
rope_theta = config.rope_theta rope_scaling = config.rope_scaling diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index c99970284a..9e7490e3c4 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -248,9 +248,7 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): vllm_config.cache_config.sliding_window = None - for attr in ("sliding_window", "interleaved_sliding_window"): - if hasattr(hf_config, attr): - delattr(hf_config, attr) + hf_config.sliding_window = None super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 48ec611df1..bc511d8339 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -167,18 +167,11 @@ class LlamaAttention(nn.Module): rope_scaling=rope_scaling, quant_config=quant_config) - if hasattr(config, "interleaved_sliding_window"): - interleaved_sliding_window = config.interleaved_sliding_window - if isinstance(interleaved_sliding_window, int): - sliding_window = interleaved_sliding_window - elif isinstance(interleaved_sliding_window, list): - sw_idx = layer_idx % len(interleaved_sliding_window) - sliding_window = interleaved_sliding_window[sw_idx] - else: - raise ValueError( - f"{type(interleaved_sliding_window)} is not supported.") - else: - sliding_window = None + sliding_window = None + if layer_types := getattr(config, "layer_types", None): + is_sliding = layer_types[layer_idx] == "sliding_attention" + if is_sliding: + sliding_window = config.sliding_window self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index 1a761d01fc..493a4192d3 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -116,13 +116,8 @@ class SambaYAttention(nn.Module): self.Wqkv = nn.Linear(self.hidden_size, op_size, bias=True) # disable sliding window for the second half of the model - sliding_window = config.interleaved_sliding_window[layer_idx] - if layer_idx >= config.num_hidden_layers // 2: - assert sliding_window is None, \ - "sliding_window must be none for the second decoder" - else: - assert sliding_window is not None, \ - "sliding_window must be set for the first decoder" + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if is_sliding else None assert self.num_heads % 2 == 0, 'num_heads should be even' assert self.num_key_value_heads % 2 == 0, 'num_heads should be even' diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e4f0de04e9..7304fbf120 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -49,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import is_interleaved from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -285,8 +286,7 @@ class Qwen2Model(nn.Module): quant_config = vllm_config.quant_config # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): + if is_interleaved(vllm_config.model_config.hf_text_config): 
assert config.max_window_layers == config.num_hidden_layers, ( "Sliding window for some but all layers is not supported. " "This model uses sliding window but `max_window_layers` = {} " diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 92e132045c..fc4585618b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -16,7 +16,7 @@ # limitations under the License. """Wrapper around `transformers` models""" from collections.abc import Iterable, Mapping -from contextlib import contextmanager, nullcontext +from contextlib import contextmanager from typing import Literal, Optional, Union import regex as re @@ -382,33 +382,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): ) -class ConfigOverride: - """Context manager to temporarily override config attributes.""" - - def __init__(self, config: PretrainedConfig, **kwargs): - self.config = config - self.kwargs = kwargs - self.kwargs_original = {} - self.kwargs_delete = set() - - def __enter__(self): - """Override config attributes.""" - for key, value in self.kwargs.items(): - if not hasattr(self.config, key): - self.kwargs_delete.add(key) - self.kwargs_original[key] = getattr(self.config, key, None) - setattr(self.config, key, value) - return self.config - - def __exit__(self, exc_type, exc_value, traceback): - """Restore original config attributes.""" - for key, value in self.kwargs_original.items(): - if key in self.kwargs_delete: - delattr(self.config, key) - else: - setattr(self.config, key, value) - - class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" @@ -434,21 +407,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # To be updated in child classes for use in `load_weights` self.skip_prefixes: Optional[list[str]] = None - # vLLM handles interleaved sliding window attention by creating a new - # interleaved_sliding_window attribute and deleting the sliding_window - # attribute. This breaks the constructors in Transformers so we - # temporarily add the attribute back to construct the model. - config_override = nullcontext() - if hasattr(self.config, "interleaved_sliding_window"): - config_override = ConfigOverride( - self.config, - sliding_window=self.config.interleaved_sliding_window) - # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` # method once its checks are fixed in Transformers. 
self.text_config._attn_implementation = "vllm" - with init_on_device_without_buffers("meta"), config_override: + with init_on_device_without_buffers("meta"): self.model: PreTrainedModel = AutoModel.from_config( self.config, torch_dtype=self.model_config.dtype, @@ -575,11 +538,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): attention_instances = {} for i in range(start, end): # Handle interleaved sliding window attention - sliding_window = None - if (hasattr(self.config, "interleaved_sliding_window") - and hasattr(self.config, "sliding_window_pattern") - and ((i + 1) % self.config.sliding_window_pattern > 0)): - sliding_window = self.config.interleaved_sliding_window + per_layer_sliding_window = None + if (hasattr(self.config, "layer_types") + and self.config.layer_types[i] == "sliding_attention"): + per_layer_sliding_window = self.config.sliding_window attention_instances[i] = Attention( num_heads=num_heads, @@ -590,7 +552,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): num_kv_heads=num_kv_heads, cache_config=self.cache_config, quant_config=self.quant_config, - per_layer_sliding_window=sliding_window, + per_layer_sliding_window=per_layer_sliding_window, prefix=f"{i}.attn") return attention_instances diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index de779f94a4..6b70164c8c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -280,6 +280,17 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool: return getattr(config, "is_encoder_decoder", False) +def is_interleaved(config: PretrainedConfig) -> bool: + """ + Detect if the model with this config is used with interleaved attention. + """ + text_config = config.get_text_config() + if layer_types := getattr(text_config, "layer_types", None): + interleaved_types = {"full_attention", "sliding_attention"} + return interleaved_types.issubset(layer_types) + return False + + def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: """Remap config attributes to match the expected names.""" for old_attr, new_attr in _CONFIG_ATTRS_MAPPING.items(): @@ -423,6 +434,23 @@ def get_config( raise e config = _maybe_remap_hf_config_attrs(config) + # Phi4Flash misuses this config as list[int]. Convert it to int and add + # the layer_types list[str] to make it HF compatible + if (config.model_type == "phi4flash"): + # TODO: Remove after the following PR is merged: + # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6 + if not hasattr(config, "layer_types"): + config.layer_types = [ + "sliding_attention" if i < config.num_hidden_layers // 2 + and i % 2 == 1 else "full_attention" + for i in range(config.num_hidden_layers) + ] + # TODO: Remove after the following PR is merged: + # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7 + if isinstance(config.sliding_window, list): + config.sliding_window = next( + filter(None, config.sliding_window), None) + elif config_format == ConfigFormat.MISTRAL: # This function loads a params.json config which # should be used when loading models in mistral format @@ -434,6 +462,18 @@ def get_config( config_dict["max_position_embeddings"] = max_position_embeddings config = adapt_config_dict(config_dict) + + # Mistral configs may define sliding_window as list[int]. 
Convert it + # to int and add the layer_types list[str] to make it HF compatible + if ((sliding_window := getattr(config, "sliding_window", None)) + and isinstance(sliding_window, list)): + pattern_repeats = config.num_hidden_layers // len(sliding_window) + layer_types = sliding_window * pattern_repeats + config.layer_types = [ + "full_attention" if layer_type is None else "sliding_attention" + for layer_type in layer_types + ] + config.sliding_window = next(filter(None, sliding_window), None) else: supported_formats = [ fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO From 7e8d685775fe9e11c3cea79e84418a9f0bab4a5f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 15:08:23 +0800 Subject: [PATCH 134/932] [Minor] Fix pre-commit error on main (#22579) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3ad5f5b7ad..86cc6e0e5d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1038,9 +1038,9 @@ def inplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> None: #noqa: UP006 + w2_bias: Optional[torch.Tensor] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, From 326976291b541f0fd5bef34aa1ff4a84bf8fb37d Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 10 Aug 2025 15:08:48 +0800 Subject: [PATCH 135/932] [Misc] code clean duplicate set_current_vllm_config in _set_vllm_config (#22566) Signed-off-by: Andy Xie --- tests/kernels/moe/modular_kernel_tools/parallel_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 1f8d21a7a7..459b785e65 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -36,7 +36,6 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int, import tempfile temp_file = tempfile.mkstemp()[1] - set_current_vllm_config(vllm_config) with set_current_vllm_config(vllm_config): init_distributed_environment( world_size=world_size, From 010e0e39ea49508a94ad42062505d7629e19b8d2 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 10 Aug 2025 01:35:22 -0700 Subject: [PATCH 136/932] [Doc] Fix API doc link in side navigation (#22585) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- docs/.nav.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/.nav.yml b/docs/.nav.yml index 77342e2674..f57703c329 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -1,5 +1,5 @@ nav: - - Home: + - Home: - vLLM: README.md - Getting Started: - getting_started/quickstart.md @@ -11,7 +11,7 @@ nav: - Quick Links: - User Guide: usage/README.md - Developer Guide: contributing/README.md - - API Reference: api/README.md + - API Reference: api/summary.md - CLI Reference: cli/README.md - Timeline: - Roadmap: 
https://roadmap.vllm.ai @@ -49,7 +49,7 @@ nav: - General: - glob: contributing/* flatten_single_child_sections: true - - Model Implementation: + - Model Implementation: - contributing/model/README.md - contributing/model/basic.md - contributing/model/registration.md From d411df029648ff8107bddf89594b101879960491 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 10 Aug 2025 20:49:48 +0800 Subject: [PATCH 137/932] [Misc] Further refine type annotations in parallel state (#22499) Signed-off-by: DarkLight1337 --- vllm/distributed/eplb/eplb_state.py | 3 --- vllm/distributed/parallel_state.py | 36 +++++++++++++++-------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index c415d409f7..979f2a06ce 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -259,7 +259,6 @@ class EplbState: if global_expert_load is not None: ep_group = get_ep_group().device_group - assert ep_group is not None assert global_expert_load.shape == (model.num_moe_layers, model.num_logical_experts) assert global_expert_load.dtype == torch.int64 @@ -366,7 +365,6 @@ class EplbState: # Collect load metrics from all ranks ep_group = get_ep_group().device_group - assert ep_group is not None all_reduce(total_expert_load_pass, group=ep_group) # num_tokens_per_rank: (num_moe_layers, num_ranks) @@ -422,7 +420,6 @@ class EplbState: """ ep_group = get_ep_group().device_group - assert ep_group is not None ep_rank = ep_group.rank() time_start = None diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0b3993ca02..b89aee99c8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -197,11 +197,10 @@ class GroupCoordinator: # 3 | 1 | 3 | 1 | 3 local_rank: int # local rank used to assign devices rank_in_group: int # rank inside the group - cpu_group: Optional[ProcessGroup] # group for CPU communication - device_group: Optional[ProcessGroup] # group for device communication - use_device_communicator: bool # whether to use device communicator - device_communicator: Optional[ - DeviceCommunicatorBase] # device communicator + cpu_group: ProcessGroup # group for CPU communication + device_group: ProcessGroup # group for device communication + # device communicator (if use_device_communicator=True) + device_communicator: Optional[DeviceCommunicatorBase] mq_broadcaster: Optional[Any] # shared memory broadcaster def __init__( @@ -209,7 +208,7 @@ class GroupCoordinator: group_ranks: list[list[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], - use_device_communicator: bool, + use_device_communicator: bool, # whether to use device communicator use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ): @@ -219,8 +218,9 @@ class GroupCoordinator: self.rank = torch.distributed.get_rank() self.local_rank = local_rank - self.device_group = None - self.cpu_group = None + + self_device_group = None + self_cpu_group = None for ranks in group_ranks: device_group = torch.distributed.new_group( @@ -232,11 +232,14 @@ class GroupCoordinator: self.ranks = ranks self.world_size = len(ranks) self.rank_in_group = ranks.index(self.rank) - self.device_group = device_group - self.cpu_group = cpu_group + self_device_group = device_group + self_cpu_group = cpu_group - assert self.cpu_group is not None - assert self.device_group is not None + assert self_cpu_group is not None + assert self_device_group is not None + + 
self.cpu_group = self_cpu_group + self.device_group = self_device_group from vllm.platforms import current_platform @@ -251,7 +254,6 @@ class GroupCoordinator: self.device = torch.device("cpu") self.use_device_communicator = use_device_communicator - self.device_communicator = None if use_device_communicator and self.world_size > 1: device_comm_cls = resolve_obj_by_qualname( @@ -817,12 +819,12 @@ class GroupCoordinator: return self.device_communicator.recv(size, dtype, src) def destroy(self): - if self.device_group is not None: + if hasattr(self, "device_group"): torch.distributed.destroy_process_group(self.device_group) - self.device_group = None - if self.cpu_group is not None: + del self.device_group + if hasattr(self, "cpu_group"): torch.distributed.destroy_process_group(self.cpu_group) - self.cpu_group = None + del self.cpu_group if self.device_communicator is not None: self.device_communicator.destroy() if self.mq_broadcaster is not None: From 00976db0c311be2b0bbc6f7769918f61a8d17bcf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:49:51 +0100 Subject: [PATCH 138/932] [Docs] Fix warnings in docs build (#22588) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/api/summary.md | 2 - docs/configuration/tpu.md | 2 +- docs/contributing/model/multimodal.md | 8 +- docs/models/generative_models.md | 4 +- docs/models/pooling_models.md | 2 +- docs/models/supported_models.md | 2 +- vllm/attention/layers/__init__.py | 0 vllm/inputs/__init__.py | 10 +- vllm/model_executor/warmup/__init__.py | 0 vllm/sampling_params.py | 140 +++++++++++-------------- 10 files changed, 80 insertions(+), 90 deletions(-) create mode 100644 vllm/attention/layers/__init__.py create mode 100644 vllm/model_executor/warmup/__init__.py diff --git a/docs/api/summary.md b/docs/api/summary.md index db4dab0ae5..327472df1d 100644 --- a/docs/api/summary.md +++ b/docs/api/summary.md @@ -1,7 +1,5 @@ # Summary -[](){ #configuration } - ## Configuration API documentation for vLLM's configuration classes. diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index a2941c80bd..a93435ed71 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ ### Tune your workloads -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. +Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case. ### Future Topics We'll Cover diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 3295b8c711..64a48be326 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -540,8 +540,10 @@ return a schema of the tensors outputted by the HF processor that are related to The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore `(1, num_images, num_patches, patch_width * patch_height * num_channels)`. 
- In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, - we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: + In order to support the use of + [MultiModalFieldConfig.batched][vllm.multimodal.inputs.MultiModalFieldConfig.batched] + like in LLaVA, we remove the extra batch dimension by overriding + [BaseMultiModalProcessor._call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]: ??? code @@ -816,7 +818,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2), [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3), and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4), -decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.processing.MultiModalRegistry.register_processor] +decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor] to register them to the multi-modal registry: ```diff diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index a3ad413593..a64ecd31eb 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. +which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text. ## Configuration @@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration][configuration] for a list of options when initializing the model. +See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. ### `LLM.generate` diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index c6588363b6..39f209d0eb 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration][configuration] for a list of options when initializing the model. +See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. ### `LLM.embed` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 5c48998ba4..ddab7ad5d9 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -770,7 +770,7 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. 
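The `generative_models.md` hunk above points readers at offline inference through `LLM.generate` and `SamplingParams`. A minimal sketch of that flow, with an illustrative model name that is not taken from this patch:

    from vllm import LLM, SamplingParams

    # Illustrative checkpoint; any generative model supported by vLLM works here.
    llm = LLM(model="facebook/opt-125m")

    # Sampling knobs documented in the vllm/sampling_params.py hunk further below.
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

    outputs = llm.generate(["The capital of France is"], params)
    for output in outputs:
        # Each RequestOutput carries one or more completions; print the first.
        print(output.outputs[0].text)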
-| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][parallelism-scaling] | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ | diff --git a/vllm/attention/layers/__init__.py b/vllm/attention/layers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 37bf2b7a44..aef7841e71 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, embeds_inputs, +from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, + EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, + ProcessorInputs, PromptType, SingletonInputs, + SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, + build_explicit_enc_dec_prompt, embeds_inputs, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) from .registry import (DummyData, InputContext, InputProcessingContext, InputRegistry) @@ -24,6 +25,7 @@ __all__ = [ "ExplicitEncoderDecoderPrompt", "TokenInputs", "EmbedsInputs", + "EmbedsPrompt", "token_inputs", "embeds_inputs", "DecoderOnlyInputs", diff --git a/vllm/model_executor/warmup/__init__.py b/vllm/model_executor/warmup/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 52e4cbd096..df4cca9ba1 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -103,113 +103,89 @@ class SamplingParams( Overall, we follow the sampling parameters from the OpenAI text completion API (https://platform.openai.com/docs/api-reference/completions/create). In addition, we support beam search, which is not supported by OpenAI. - - Args: - n: Number of output sequences to return for the given prompt. - best_of: Number of output sequences that are generated from the prompt. - From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. By default, - `best_of` is set to `n`. Warning, this is only supported in V0. - presence_penalty: Float that penalizes new tokens based on whether they - appear in the generated text so far. Values > 0 encourage the model - to use new tokens, while values < 0 encourage the model to repeat - tokens. - frequency_penalty: Float that penalizes new tokens based on their - frequency in the generated text so far. Values > 0 encourage the - model to use new tokens, while values < 0 encourage the model to - repeat tokens. - repetition_penalty: Float that penalizes new tokens based on whether - they appear in the prompt and the generated text so far. Values > 1 - encourage the model to use new tokens, while values < 1 encourage - the model to repeat tokens. - temperature: Float that controls the randomness of the sampling. 
Lower - values make the model more deterministic, while higher values make - the model more random. Zero means greedy sampling. - top_p: Float that controls the cumulative probability of the top tokens - to consider. Must be in (0, 1]. Set to 1 to consider all tokens. - top_k: Integer that controls the number of top tokens to consider. Set - to 0 (or -1) to consider all tokens. - min_p: Float that represents the minimum probability for a token to be - considered, relative to the probability of the most likely token. - Must be in [0, 1]. Set to 0 to disable this. - seed: Random seed to use for the generation. - stop: list of strings that stop the generation when they are generated. - The returned output will not contain the stop strings. - stop_token_ids: list of tokens that stop the generation when they are - generated. The returned output will contain the stop tokens unless - the stop tokens are special tokens. - bad_words: list of words that are not allowed to be generated. - More precisely, only the last token of a corresponding - token sequence is not allowed when the next generated token - can complete the sequence. - include_stop_str_in_output: Whether to include the stop strings in - output text. Defaults to False. - ignore_eos: Whether to ignore the EOS token and continue generating - tokens after the EOS token is generated. - max_tokens: Maximum number of tokens to generate per output sequence. - min_tokens: Minimum number of tokens to generate per output sequence - before EOS or stop_token_ids can be generated - logprobs: Number of log probabilities to return per output token. - When set to None, no probability is returned. If set to a non-None - value, the result includes the log probabilities of the specified - number of most likely tokens, as well as the chosen tokens. - Note that the implementation follows the OpenAI API: The API will - always return the log probability of the sampled token, so there - may be up to `logprobs+1` elements in the response. - When set to -1, return all `vocab_size` log probabilities. - prompt_logprobs: Number of log probabilities to return per prompt token. - detokenize: Whether to detokenize the output. Defaults to True. - skip_special_tokens: Whether to skip special tokens in the output. - spaces_between_special_tokens: Whether to add spaces between special - tokens in the output. Defaults to True. - logits_processors: list of functions that modify logits based on - previously generated tokens, and optionally prompt tokens as - a first argument. - truncate_prompt_tokens: If set to -1, will use the truncation size - supported by the model. If set to an integer k, will use only - the last k tokens from the prompt (i.e., left truncation). - Defaults to None (i.e., no truncation). - guided_decoding: If provided, the engine will construct a guided - decoding logits processor from these parameters. Defaults to None. - logit_bias: If provided, the engine will construct a logits processor - that applies these logit biases. Defaults to None. - allowed_token_ids: If provided, the engine will construct a logits - processor which only retains scores for the given token ids. - Defaults to None. - extra_args: Arbitrary additional args, that can be used by custom - sampling implementations, plugins, etc. Not used by any in-tree - sampling implementations. """ n: int = 1 + """Number of output sequences to return for the given prompt.""" best_of: Optional[int] = None + """Number of output sequences that are generated from the prompt. 
From + these `best_of` sequences, the top `n` sequences are returned. `best_of` + must be greater than or equal to `n`. By default, `best_of` is set to `n`. + Warning, this is only supported in V0.""" _real_n: Optional[int] = None presence_penalty: float = 0.0 + """Penalizes new tokens based on whether they appear in the generated text + so far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" frequency_penalty: float = 0.0 + """Penalizes new tokens based on their frequency in the generated text so + far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens.""" temperature: float = 1.0 + """Controls the randomness of the sampling. Lower values make the model + more deterministic, while higher values make the model more random. Zero + means greedy sampling.""" top_p: float = 1.0 + """Controls the cumulative probability of the top tokens to consider. Must + be in (0, 1]. Set to 1 to consider all tokens.""" top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens.""" min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this.""" seed: Optional[int] = None + """Random seed to use for the generation.""" stop: Optional[Union[str, list[str]]] = None + """String(s) that stop the generation when they are generated. The returned + output will not contain the stop strings.""" stop_token_ids: Optional[list[int]] = None + """Token IDs that stop the generation when they are generated. The returned + output will contain the stop tokens unless the stop tokens are special + tokens.""" ignore_eos: bool = False + """Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated.""" max_tokens: Optional[int] = 16 + """Maximum number of tokens to generate per output sequence.""" min_tokens: int = 0 + """Minimum number of tokens to generate per output sequence before EOS or + `stop_token_ids` can be generated""" logprobs: Optional[int] = None + """Number of log probabilities to return per output token. When set to + `None`, no probability is returned. If set to a non-`None` value, the + result includes the log probabilities of the specified number of most + likely tokens, as well as the chosen tokens. Note that the implementation + follows the OpenAI API: The API will always return the log probability of + the sampled token, so there may be up to `logprobs+1` elements in the + response. When set to -1, return all `vocab_size` log probabilities.""" prompt_logprobs: Optional[int] = None + """Number of log probabilities to return per prompt token.""" # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. detokenize: bool = True + """Whether to detokenize the output.""" skip_special_tokens: bool = True + """Whether to skip special tokens in the output.""" spaces_between_special_tokens: bool = True + """Whether to add spaces between special tokens in the output.""" # Optional[list[LogitsProcessor]] type. 
We use Any here because # Optional[list[LogitsProcessor]] type is not supported by msgspec. logits_processors: Optional[Any] = None + """Functions that modify logits based on previously generated tokens, and + optionally prompt tokens as a first argument.""" include_stop_str_in_output: bool = False + """Whether to include the stop strings in output text.""" truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None + """If set to -1, will use the truncation size supported by the model. If + set to an integer k, will use only the last k tokens from the prompt + (i.e., left truncation). If set to `None`, truncation is disabled.""" output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE # The below fields are not supposed to be used as an input. @@ -219,12 +195,24 @@ class SamplingParams( # Fields used to construct logits processors guided_decoding: Optional[GuidedDecodingParams] = None + """If provided, the engine will construct a guided decoding logits + processor from these parameters.""" logit_bias: Optional[dict[int, float]] = None + """If provided, the engine will construct a logits processor that applies + these logit biases.""" allowed_token_ids: Optional[list[int]] = None + """If provided, the engine will construct a logits processor which only + retains scores for the given token ids.""" extra_args: Optional[dict[str, Any]] = None + """Arbitrary additional args, that can be used by custom sampling + implementations, plugins, etc. Not used by any in-tree sampling + implementations.""" # Fields used for bad words bad_words: Optional[list[str]] = None + """Words that are not allowed to be generated. More precisely, only the + last token of a corresponding token sequence is not allowed when the next + generated token can complete the sequence.""" _bad_words_token_ids: Optional[list[list[int]]] = None @staticmethod From 049c245143ef0f8fd338fc3200f51a18fc53b403 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 21:18:21 +0800 Subject: [PATCH 139/932] [Misc] Replace flaky image urls in pixtral test (#22574) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> --- .../multimodal/generation/test_pixtral.py | 24 +++++++++---------- tests/models/utils.py | 3 ++- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index e157d6f4a7..d39cf70678 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -18,7 +18,7 @@ from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test -from ...utils import check_logprobs_close +from ...utils import check_logprobs_close, dummy_hf_overrides if TYPE_CHECKING: from _typeshed import StrPath @@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID] IMG_URLS = [ - "https://picsum.photos/id/237/400/300", - "https://picsum.photos/id/231/200/300", - "https://picsum.photos/id/27/500/500", - "https://picsum.photos/id/17/150/600", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg", + 
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg", ] PROMPT = "Describe each image in one short sentence." @@ -110,11 +110,6 @@ MSGS = [ _create_msg_format(IMG_URLS[:2]), _create_msg_format(IMG_URLS), ] -ENGINE_INPUTS = [ - _create_engine_inputs(IMG_URLS[:1]), - _create_engine_inputs(IMG_URLS[:2]), - _create_engine_inputs(IMG_URLS), -] SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) @@ -195,7 +190,6 @@ def test_chat( name_1="output") -@large_gpu_test(min_gb=48) @pytest.mark.parametrize("prompt,expected_ranges", [(_create_engine_inputs_hf(IMG_URLS[:1]), [PlaceholderRange(offset=11, length=494)]), @@ -204,7 +198,7 @@ def test_chat( PlaceholderRange(offset=277, length=1056), PlaceholderRange(offset=1333, length=418) ])]) -def test_multi_modal_placeholders(vllm_runner, prompt, +def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt, expected_ranges: list[PlaceholderRange], monkeypatch) -> None: @@ -215,6 +209,8 @@ def test_multi_modal_placeholders(vllm_runner, prompt, "mistral-community/pixtral-12b", max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + load_format="dummy", + hf_overrides=dummy_hf_overrides, ) as vllm_model: outputs = vllm_model.llm.generate(prompt) @@ -230,5 +226,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, expected_ranges), f"{image_placeholder_ranges=}" for real_range, expected_range in zip(image_placeholder_ranges, expected_ranges): - assert real_range == expected_range, \ + assert real_range.offset == expected_range.offset, \ + f"{real_range=} {expected_range=}" + assert real_range.length == expected_range.length, \ f"{real_range=} {expected_range=}" diff --git a/tests/models/utils.py b/tests/models/utils.py index 1e3d51aeec..11ddf45c8e 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -357,7 +357,8 @@ class RerankModelInfo(NamedTuple): def dummy_hf_overrides( hf_config: PretrainedConfig, - model_arch: str, + *, + model_arch: str = "", exist_overrides: Optional[dict[str, Any]] = None, ) -> PretrainedConfig: """ From 8290d15d2c6a4a82e4fd0af86b352aa522178a68 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 15:36:40 +0100 Subject: [PATCH 140/932] Move `CacheConfig` from `config/__init__.py` to `config/cache.py` (#22586) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 190 +------------------------------------ vllm/config/cache.py | 204 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 186 deletions(-) create mode 100644 vllm/config/cache.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 49da3fd848..700d29f956 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -29,6 +29,8 @@ from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version +from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, + PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig @@ -49,9 +51,8 @@ from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf: disable from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, - LayerBlockType, LazyLoader, 
common_broadcastable_dtype, - get_cpu_memory, random_uuid) + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType, + LazyLoader, common_broadcastable_dtype, random_uuid) # yapf: enable @@ -1731,189 +1732,6 @@ class ModelConfig: return max_model_len -BlockSize = Literal[1, 8, 16, 32, 64, 128] -CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] -PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] - - -@config -@dataclass -class CacheConfig: - """Configuration for the KV cache.""" - - block_size: SkipValidation[BlockSize] = None # type: ignore - """Size of a contiguous cache block in number of tokens. This is ignored on - neuron devices and set to `--max-model-len`. On CUDA devices, only block - sizes up to 32 are supported. On HPU devices, block size defaults to 128. - - This config has no static default. If left unspecified by the user, it will - be set in `Platform.check_and_update_config()` based on the current - platform.""" - gpu_memory_utilization: float = 0.9 - """The fraction of GPU memory to be used for the model executor, which can - range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory - utilization. If unspecified, will use the default value of 0.9. This is a - per-instance limit, and only applies to the current vLLM instance. It does - not matter if you have another vLLM instance running on the same GPU. For - example, if you have two vLLM instances running on the same GPU, you can - set the GPU memory utilization to 0.5 for each instance.""" - swap_space: float = 4 - """Size of the CPU swap space per GPU (in GiB).""" - cache_dtype: CacheDType = "auto" - """Data type for kv cache storage. If "auto", will use model data type. - CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports - fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).""" - is_attention_free: bool = False - """Whether the model is attention-free. This is primarily set in - `ModelConfig` and that value should be manually duplicated here.""" - num_gpu_blocks_override: Optional[int] = None - """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks` - if specified. Does nothing if `None`. Used for testing preemption.""" - sliding_window: Optional[int] = None - """Sliding window size for the KV cache. This is primarily set in - `ModelConfig` and that value should be manually duplicated here.""" - enable_prefix_caching: Optional[bool] = None - """Whether to enable prefix caching. Disabled by default for V0. Enabled by - default for V1.""" - prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin" - """Set the hash algorithm for prefix caching:\n - - "builtin" is Python's built-in hash.\n - - "sha256" is collision resistant but with certain overheads. - This option uses Pickle for object serialization before hashing.\n - - "sha256_cbor_64bit" provides a reproducible, cross-language compatible - hash. It serializes objects using canonical CBOR and hashes them with - SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256 - digest.""" - cpu_offload_gb: float = 0 - """The space in GiB to offload to CPU, per GPU. Default is 0, which means - no offloading. Intuitively, this argument can be seen as a virtual way to - increase the GPU memory size. For example, if you have one 24 GB GPU and - set this to 10, virtually you can think of it as a 34 GB GPU. Then you can - load a 13B model with BF16 weight, which requires at least 26GB GPU memory. 
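The 26 GB figure in the `cpu_offload_gb` docstring follows from simple weight-size arithmetic. A quick sketch, using only the numbers from the docstring's own example:

    params = 13e9                                  # 13B parameters
    bytes_per_param = 2                            # BF16 stores 2 bytes per weight
    weights_gb = params * bytes_per_param / 1e9    # ~26 GB for the weights alone
    gpu_gb, offload_gb = 24, 10                    # the docstring's example setup
    assert weights_gb <= gpu_gb + offload_gb       # fits once CPU offload is enabled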
- Note that this requires fast CPU-GPU interconnect, as part of the model is - loaded from CPU memory to GPU memory on the fly in each model forward pass. - """ - calculate_kv_scales: bool = False - """This enables dynamic calculation of `k_scale` and `v_scale` when - kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model - checkpoint if available. Otherwise, the scales will default to 1.0.""" - cpu_kvcache_space_bytes: Optional[int] = None - """(CPU backend only) CPU key-value cache space.""" - mamba_page_size_padded: Optional[int] = None - """ Optional override for mamba page size; used by hybrid mamba/attention - models to ensure exact alignment with attention page size.""" - - # Will be set after profiling. - num_gpu_blocks: Optional[int] = field(default=None, init=False) - """The number of blocks to allocate for GPU memory.""" - num_cpu_blocks: Optional[int] = field(default=None, init=False) - """The number of blocks to allocate for CPU memory.""" - - kv_sharing_fast_prefill: bool = False - """This feature is work in progress and no prefill optimization takes place - with this flag enabled currently. - - In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), - some layers can skip tokens corresponding to prefill. This flag enables - attention metadata for eligible layers to be overriden with metadata - necessary for implementating this optimization in some models (e.g. Gemma3n) - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.cache_dtype) - # `cpu_offload_gb` does not use `torch.compile` yet. - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self) -> None: - self.swap_space_bytes = self.swap_space * GiB_bytes - - self._verify_cache_dtype() - self._verify_prefix_caching() - - def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus - # metrics info - return {key: str(value) for key, value in self.__dict__.items()} - - @model_validator(mode='after') - def _verify_args(self) -> Self: - if self.cpu_offload_gb < 0: - raise ValueError("CPU offload space must be non-negative" - f", but got {self.cpu_offload_gb}") - - if self.gpu_memory_utilization > 1.0: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") - - if self.kv_sharing_fast_prefill: - logger.warning_once( - "--kv-sharing-fast-prefill is currently work in progress " - "and not functional yet (i.e. no prefill savings)") - - return self - - def _verify_cache_dtype(self) -> None: - if self.cache_dtype == "auto": - pass - elif self.cache_dtype in get_args(CacheDType): - logger.info( - "Using fp8 data type to store kv cache. It reduces the GPU " - "memory footprint and boosts the performance. 
" - "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor.") - else: - raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") - - def _verify_prefix_caching(self) -> None: - if not self.enable_prefix_caching: - return - - if self.sliding_window is not None and not envs.VLLM_USE_V1: - raise NotImplementedError( - "Prefix caching is not supported with sliding window. " - "Run with --disable-sliding-window to use prefix caching.") - - if (self.enable_prefix_caching and self.prefix_caching_hash_algo - not in get_args(PrefixCachingHashAlgo)): - raise ValueError( - "Unknown prefix caching hash algorithm: " - f"{self.prefix_caching_hash_algo}. Must be one of " - f"{get_args(PrefixCachingHashAlgo)}.") - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - total_cpu_memory = get_cpu_memory() - # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel - # group are in the same node. However, the GPUs may span multiple nodes. - num_gpus_per_node = parallel_config.tensor_parallel_size - cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - - msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " - f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " - "is allocated for the swap space.") - if cpu_memory_usage > 0.7 * total_cpu_memory: - raise ValueError("Too large swap space. " + msg) - elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. %s", msg) - - @config @dataclass class LoadConfig: diff --git a/vllm/config/cache.py b/vllm/config/cache.py new file mode 100644 index 0000000000..69cb0d9732 --- /dev/null +++ b/vllm/config/cache.py @@ -0,0 +1,204 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, get_args + +from pydantic import SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import GiB_bytes, get_cpu_memory + +if TYPE_CHECKING: + from vllm.config.parallel import ParallelConfig +else: + ParallelConfig = Any + +logger = init_logger(__name__) + +BlockSize = Literal[1, 8, 16, 32, 64, 128] +CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] +PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] + + +@config +@dataclass +class CacheConfig: + """Configuration for the KV cache.""" + + block_size: SkipValidation[BlockSize] = None # type: ignore + """Size of a contiguous cache block in number of tokens. This is ignored on + neuron devices and set to `--max-model-len`. On CUDA devices, only block + sizes up to 32 are supported. On HPU devices, block size defaults to 128. + + This config has no static default. If left unspecified by the user, it will + be set in `Platform.check_and_update_config()` based on the current + platform.""" + gpu_memory_utilization: float = 0.9 + """The fraction of GPU memory to be used for the model executor, which can + range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory + utilization. If unspecified, will use the default value of 0.9. This is a + per-instance limit, and only applies to the current vLLM instance. It does + not matter if you have another vLLM instance running on the same GPU. 
For + example, if you have two vLLM instances running on the same GPU, you can + set the GPU memory utilization to 0.5 for each instance.""" + swap_space: float = 4 + """Size of the CPU swap space per GPU (in GiB).""" + cache_dtype: CacheDType = "auto" + """Data type for kv cache storage. If "auto", will use model data type. + CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports + fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).""" + is_attention_free: bool = False + """Whether the model is attention-free. This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + num_gpu_blocks_override: Optional[int] = None + """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks` + if specified. Does nothing if `None`. Used for testing preemption.""" + sliding_window: Optional[int] = None + """Sliding window size for the KV cache. This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + enable_prefix_caching: Optional[bool] = None + """Whether to enable prefix caching. Disabled by default for V0. Enabled by + default for V1.""" + prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin" + """Set the hash algorithm for prefix caching:\n + - "builtin" is Python's built-in hash.\n + - "sha256" is collision resistant but with certain overheads. + This option uses Pickle for object serialization before hashing.\n + - "sha256_cbor_64bit" provides a reproducible, cross-language compatible + hash. It serializes objects using canonical CBOR and hashes them with + SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256 + digest.""" + cpu_offload_gb: float = 0 + """The space in GiB to offload to CPU, per GPU. Default is 0, which means + no offloading. Intuitively, this argument can be seen as a virtual way to + increase the GPU memory size. For example, if you have one 24 GB GPU and + set this to 10, virtually you can think of it as a 34 GB GPU. Then you can + load a 13B model with BF16 weight, which requires at least 26GB GPU memory. + Note that this requires fast CPU-GPU interconnect, as part of the model is + loaded from CPU memory to GPU memory on the fly in each model forward pass. + """ + calculate_kv_scales: bool = False + """This enables dynamic calculation of `k_scale` and `v_scale` when + kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model + checkpoint if available. Otherwise, the scales will default to 1.0.""" + cpu_kvcache_space_bytes: Optional[int] = None + """(CPU backend only) CPU key-value cache space.""" + mamba_page_size_padded: Optional[int] = None + """ Optional override for mamba page size; used by hybrid mamba/attention + models to ensure exact alignment with attention page size.""" + + # Will be set after profiling. + num_gpu_blocks: Optional[int] = field(default=None, init=False) + """The number of blocks to allocate for GPU memory.""" + num_cpu_blocks: Optional[int] = field(default=None, init=False) + """The number of blocks to allocate for CPU memory.""" + + kv_sharing_fast_prefill: bool = False + """This feature is work in progress and no prefill optimization takes place + with this flag enabled currently. + + In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), + some layers can skip tokens corresponding to prefill. This flag enables + attention metadata for eligible layers to be overriden with metadata + necessary for implementating this optimization in some models (e.g. 
Gemma3n) + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.cache_dtype) + # `cpu_offload_gb` does not use `torch.compile` yet. + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + self.swap_space_bytes = self.swap_space * GiB_bytes + + self._verify_cache_dtype() + self._verify_prefix_caching() + + def metrics_info(self): + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info + return {key: str(value) for key, value in self.__dict__.items()} + + @model_validator(mode='after') + def _verify_args(self) -> Self: + if self.cpu_offload_gb < 0: + raise ValueError("CPU offload space must be non-negative" + f", but got {self.cpu_offload_gb}") + + if self.gpu_memory_utilization > 1.0: + raise ValueError( + "GPU memory utilization must be less than 1.0. Got " + f"{self.gpu_memory_utilization}.") + + if self.kv_sharing_fast_prefill: + logger.warning_once( + "--kv-sharing-fast-prefill is currently work in progress " + "and not functional yet (i.e. no prefill savings)") + + return self + + def _verify_cache_dtype(self) -> None: + if self.cache_dtype == "auto": + pass + elif self.cache_dtype in get_args(CacheDType): + logger.info( + "Using fp8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor.") + else: + raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") + + def _verify_prefix_caching(self) -> None: + if not self.enable_prefix_caching: + return + + if self.sliding_window is not None and not envs.VLLM_USE_V1: + raise NotImplementedError( + "Prefix caching is not supported with sliding window. " + "Run with --disable-sliding-window to use prefix caching.") + + if (self.enable_prefix_caching and self.prefix_caching_hash_algo + not in get_args(PrefixCachingHashAlgo)): + raise ValueError( + "Unknown prefix caching hash algorithm: " + f"{self.prefix_caching_hash_algo}. Must be one of " + f"{get_args(PrefixCachingHashAlgo)}.") + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + total_cpu_memory = get_cpu_memory() + # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel + # group are in the same node. However, the GPUs may span multiple nodes. + num_gpus_per_node = parallel_config.tensor_parallel_size + cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node + + msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " + f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " + "is allocated for the swap space.") + if cpu_memory_usage > 0.7 * total_cpu_memory: + raise ValueError("Too large swap space. " + msg) + elif cpu_memory_usage > 0.4 * total_cpu_memory: + logger.warning("Possibly too large swap space. 
%s", msg) From 0757551c96fa97a4f8c0f06519e5b296171a08f1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Aug 2025 22:51:36 +0800 Subject: [PATCH 141/932] [doc] add beijing meetup links (#22596) Signed-off-by: youkaichao --- README.md | 3 ++- docs/community/meetups.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5348405b72..a65d4803fa 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,15 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 +- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). -- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

Previous News +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index e8b3a9c9c8..36232e6ad9 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). 
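
A quick gloss on the CacheConfig extraction earlier in this series (the new vllm/config/cache.py shown before the two documentation patches above): the swap-space guard in verify_with_parallel_config is simple arithmetic — swap_space GiB per GPU multiplied by the tensor-parallel size, compared against total CPU memory, with a warning above 40% and a hard error above 70%. The sketch below restates only that arithmetic as a standalone helper; check_swap_budget is an illustrative name rather than vLLM API, and it carries over the diff's FIXME assumption that all GPUs of a tensor-parallel group sit on one node.

GiB = 1 << 30

def check_swap_budget(swap_space_gib: float, tp_size: int,
                      total_cpu_memory_bytes: int) -> str:
    # Mirrors the thresholds in CacheConfig.verify_with_parallel_config:
    # swap_space GiB is allocated per GPU, and all GPUs of a tensor-parallel
    # group are assumed to share one node (see the FIXME in the diff above).
    usage = swap_space_gib * GiB * tp_size
    if usage > 0.7 * total_cpu_memory_bytes:
        return "error: too large swap space"
    if usage > 0.4 * total_cpu_memory_bytes:
        return "warning: possibly too large swap space"
    return "ok"

# Default 4 GiB swap with TP=8 on a 64 GiB host uses 32 GiB (50%): warning only.
print(check_swap_budget(4, 8, 64 * GiB))

With the default 4 GiB swap and TP=8, a 64 GiB host sits at 50% and only warns; the same settings become a hard error once the budget crosses roughly 44.8 GiB (70% of 64 GiB).
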
From b81fe83b2cfa061cb0f9cd88da9c88f22529f284 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Aug 2025 23:13:47 +0800 Subject: [PATCH 142/932] [doc] add alibaba cloud as sponsor (#22597) Signed-off-by: youkaichao --- README.md | 1 + docs/community/sponsors.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index a65d4803fa..d9e3ca660f 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ Cash Donations: Compute Resources: +- Alibaba Cloud - AMD - Anyscale - AWS diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md index b8a1ddbe38..6ad3a66252 100644 --- a/docs/community/sponsors.md +++ b/docs/community/sponsors.md @@ -15,6 +15,7 @@ Cash Donations: Compute Resources: +- Alibaba Cloud - AMD - Anyscale - AWS From b76753f0b58a070f626549115d1414ec421e7e49 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 11 Aug 2025 00:00:36 +0800 Subject: [PATCH 143/932] [Bugfix][Kernel] Support partial rotary embedding for MRoPE triton kernel (#22593) Signed-off-by: Isotr0py --- tests/kernels/{ => core}/test_mrope.py | 20 +++++++++---- .../layers/rotary_embedding/mrope.py | 28 +++++++++++-------- 2 files changed, 30 insertions(+), 18 deletions(-) rename tests/kernels/{ => core}/test_mrope.py (92%) diff --git a/tests/kernels/test_mrope.py b/tests/kernels/core/test_mrope.py similarity index 92% rename from tests/kernels/test_mrope.py rename to tests/kernels/core/test_mrope.py index 5918b7a58b..3f2f330f6d 100644 --- a/tests/kernels/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -42,12 +42,13 @@ def unroll_model_tp_dict(model_tp_dict): model_tp_dict = { "Qwen/Qwen2-VL-7B-Instruct": [1, 2], "Qwen/Qwen2-VL-72B-Instruct": [1, 2], - "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2] + "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2], + "zai-org/GLM-4.1V-9B-Thinking": [1, 2], } # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 dtype_atol_rtol_list = [ - [torch.bfloat16, 1e-5, 1.6e-2], + [torch.bfloat16, 1e-2, 1.6e-2], ] num_tokens_list = [11, 8192] @@ -73,10 +74,12 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): rope_theta = config.rope_theta max_position = config.max_position_embeddings + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, @@ -110,7 +113,10 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): reason="Skipping CUDA/ROCm only tests.") @pytest.mark.parametrize( "model_name, tp_size", - unroll_model_tp_dict({"Qwen/Qwen2-VL-7B-Instruct": [1, 2]})) + unroll_model_tp_dict({ + "Qwen/Qwen2-VL-7B-Instruct": [1, 2], + "zai-org/GLM-4.1V-9B-Thinking": [1, 2] + })) @pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) @pytest.mark.parametrize("num_tokens", [4]) def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, @@ -126,10 +132,12 @@ def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, is_neox_style = True rope_theta = config.rope_theta max_position = config.max_position_embeddings + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, @@ -145,7 +153,7 @@ def 
test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, # Create a wrapper that makes the in-place function appear functional def functional_forward_cuda(pos, q, k): """Wrapper that converts in-place operation to functional style - + CUDA Graph does not support in-place operations. This wrapper creates working copies of the input tensors and modifies them. diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index d3b71930b6..a091cfb743 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -25,6 +25,7 @@ def _triton_qwen2vl_mrope_forward( n_qh: tl.constexpr, n_kh: tl.constexpr, hd: tl.constexpr, + rd: tl.constexpr, pad_n_qh: tl.constexpr, pad_n_kh: tl.constexpr, pad_hd: tl.constexpr, @@ -51,19 +52,19 @@ def _triton_qwen2vl_mrope_forward( h_end = t_end + mrope_section_h # Updated stride calculation for half head_dim - half_hd = hd // 2 - t_cos = cos + pid * half_hd - h_cos = t_cos + num_tokens * half_hd - w_cos = h_cos + num_tokens * half_hd - t_sin = sin + pid * half_hd - h_sin = t_sin + num_tokens * half_hd - w_sin = h_sin + num_tokens * half_hd + half_rd = rd // 2 + t_cos = cos + pid * half_rd + h_cos = t_cos + num_tokens * half_rd + w_cos = h_cos + num_tokens * half_rd + t_sin = sin + pid * half_rd + h_sin = t_sin + num_tokens * half_rd + w_sin = h_sin + num_tokens * half_rd # Updated offsets for half head_dim cos_offsets = tl.arange(0, pad_hd // 2) t_mask = cos_offsets < t_end h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) - w_mask = (h_end <= cos_offsets) & (cos_offsets < half_hd) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) @@ -85,9 +86,9 @@ def _triton_qwen2vl_mrope_forward( first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange( 0, pad_hd // 2)[None, :] first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange( - 0, pad_hd // 2)[None, :] < hd // 2) + 0, pad_hd // 2)[None, :] < rd // 2) first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange( - 0, pad_hd // 2)[None, :] < hd // 2) + 0, pad_hd // 2)[None, :] < rd // 2) q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, @@ -97,8 +98,8 @@ def _triton_qwen2vl_mrope_forward( other=0).to(sin_row.dtype) # right half of the head - second_half_q_offsets = first_half_q_offsets + (hd // 2) - second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_half_q_offsets = first_half_q_offsets + (rd // 2) + second_half_k_offsets = first_half_k_offsets + (rd // 2) second_q_mask = first_q_mask second_k_mask = first_k_mask @@ -130,6 +131,7 @@ def triton_mrope( sin: torch.Tensor, mrope_section: list[int], head_size: int, + rotary_dim: int, ) -> tuple[torch.Tensor, torch.Tensor]: """Qwen2VL mrope kernel. 
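
The point of threading rotary_dim (rd) through the kernel in the hunk above, and through the call sites in the hunks below, is partial rotary embedding: only the first rotary_dim = head_dim * partial_rotary_factor channels of each head get rotated, so the cos/sin strides and the first/second-half offsets have to step by rd // 2 instead of hd // 2. The plain-PyTorch sketch below shows only that slicing for a NeoX-style rotation and deliberately ignores the t/h/w mrope sections the real kernel handles; apply_partial_rope_neox is a hypothetical reference helper, and the 0.5 factor is an assumption matching the kind of GLM-4.1V-style config the new test case exercises.

import torch

def apply_partial_rope_neox(x: torch.Tensor, cos: torch.Tensor,
                            sin: torch.Tensor, rotary_dim: int) -> torch.Tensor:
    # Rotate only the first rotary_dim channels of each head (NeoX style);
    # the remaining head_dim - rotary_dim channels pass through untouched.
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    x1, x2 = x_rot.chunk(2, dim=-1)
    rotated = torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)
    return torch.cat((rotated, x_pass), dim=-1)

head_dim = 128
partial_rotary_factor = 0.5                          # assumed GLM-4.1V-style value
rotary_dim = int(head_dim * partial_rotary_factor)   # 64, as computed in the test
q = torch.randn(4, head_dim)
cos = torch.randn(4, rotary_dim // 2)
sin = torch.randn(4, rotary_dim // 2)
print(apply_partial_rope_neox(q, cos, sin, rotary_dim).shape)  # torch.Size([4, 128])
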
@@ -166,6 +168,7 @@ def triton_mrope( n_q_head, n_kv_head, head_size, + rotary_dim, pad_n_q_head, pad_n_kv_head, pad_hd, @@ -300,6 +303,7 @@ class MRotaryEmbedding(RotaryEmbedding): sin, self.mrope_section, self.head_size, + self.rotary_dim, ) return q.reshape(query_shape), k.reshape(key_shape) From 65a7917be480c1b0e45f12bfad31eb4b25539db9 Mon Sep 17 00:00:00 2001 From: Breno Baldas Skuk Date: Sun, 10 Aug 2025 18:03:15 +0200 Subject: [PATCH 144/932] Fix(benchmarks): allow multiple mm contents in OpenAI Chat Completion Benchmarks (#22534) Signed-off-by: breno.skuk --- benchmarks/backend_request_func.py | 17 ++++++++++++++--- benchmarks/benchmark_dataset.py | 2 +- benchmarks/benchmark_serving.py | 9 ++++++++- vllm/benchmarks/datasets.py | 4 +++- vllm/benchmarks/lib/endpoint_request_func.py | 18 +++++++++++++++--- vllm/benchmarks/serve.py | 9 ++++++++- 6 files changed, 49 insertions(+), 10 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index c7229dbb8e..1559ca2d92 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -31,7 +31,7 @@ class RequestFuncInput: model_name: Optional[str] = None logprobs: Optional[int] = None extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + multi_modal_content: Optional[dict | list[dict]] = None ignore_eos: bool = False language: Optional[str] = None @@ -364,7 +364,15 @@ async def async_request_openai_chat_completions( ) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] for openai-chat" + ) payload = { "model": request_func_input.model_name if request_func_input.model_name @@ -491,7 +499,10 @@ async def async_request_openai_audio( buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 1ad6cef7a9..ea684f18a7 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -52,7 +52,7 @@ class SampleRequest: prompt: Union[str, Any] prompt_len: int expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None lora_request: Optional[LoRARequest] = None diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 93b72211eb..ae38caf729 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -263,7 +263,14 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert test_mm_content is None or isinstance(test_mm_content, dict) + assert ( + test_mm_content is None + or isinstance(test_mm_content, dict) + or ( + isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content) + ) + 
), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 45b58035eb..4e8ac51625 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -71,7 +71,9 @@ class SampleRequest: prompt: Union[str, Any] prompt_len: int expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + multi_modal_data: Optional[ + Union[MultiModalDataDict, dict, list[dict]] + ] = None lora_request: Optional[LoRARequest] = None diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 2d64cc115f..47bc288774 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -28,7 +28,7 @@ class RequestFuncInput: model_name: Optional[str] = None logprobs: Optional[int] = None extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + multi_modal_content: Optional[dict | list[dict]] = None ignore_eos: bool = False language: Optional[str] = None @@ -172,7 +172,16 @@ async def async_request_openai_chat_completions( content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] " + "for openai-chat" + ) payload = { "model": request_func_input.model_name @@ -310,7 +319,10 @@ async def async_request_openai_audio( buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 7cdf87cb4c..7bf04c7532 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -365,7 +365,14 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert test_mm_content is None or isinstance(test_mm_content, dict) + assert ( + test_mm_content is None + or isinstance(test_mm_content, dict) + or ( + isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content) + ) + ), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, From b4e2916721463b43f3b06ccc980050dfb37b615a Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 09:05:21 -0700 Subject: [PATCH 145/932] Migrate LlavaNextImageInputs to TensorSchema (#21774) Signed-off-by: Benji Beck Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/llava_next.py | 96 ++++++++---------------- vllm/utils/tensor_schema.py | 3 + 2 files changed, 35 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 04fb6b5736..a63c18493d 100644 --- a/vllm/model_executor/models/llava_next.py +++ 
b/vllm/model_executor/models/llava_next.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -11,7 +11,6 @@ import torch.nn as nn from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) -from typing_extensions import NotRequired from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -19,6 +18,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.multimodal.parse import ImageSize from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -30,32 +30,36 @@ from .utils import (AutoWeightsLoader, WeightsMapper, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) -class LlavaNextImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class LlavaNextImagePixelInputs(TensorSchema): """ - Shape: - `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - np: Number of patches + 1 + - c: Number of channels (3) + - h: Height + - w: Width + Note that `num_patches` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"})] - image_sizes: NotRequired[torch.Tensor] + image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)] + # This should be in `(height, width)` format. + + +class LlavaNextImageEmbeddingInputs(TensorSchema): """ - Shape: `(batch_size * num_images, 2)` - - This should be in `(height, width)` format. - """ - - -class LlavaNextImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size + - hs: Hidden size (must match language model backbone) """ + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")] LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, @@ -269,44 +273,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: - expected_dims = (2, ) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape) - - if actual_dims != expected_dims: - expected_expr = str(expected_dims) - raise ValueError( - f"The expected shape of image sizes per image per batch " - f"is {expected_expr}. 
You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - - def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("num_patches", *map(str, expected_dims)) - raise ValueError( - "The expected shape of pixel values per image per batch " - f"is {expected_expr}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaNextImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -325,13 +291,15 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") + expected_h = expected_w = self.config.vision_config.image_size return LlavaNextImagePixelInputs( type="pixel_values", - pixel_values=self._validate_pixel_values( - flatten_bn(pixel_values)), - image_sizes=self._validate_image_sizes( - flatten_bn(image_sizes, concat=True)), - ) + pixel_values=flatten_bn(pixel_values), + image_sizes=flatten_bn(image_sizes, concat=True), + resolve_bindings={ + "h": expected_h, + "w": expected_w, + }) if image_embeds is not None: if not isinstance(image_embeds, torch.Tensor): diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py index 343df71e10..4c3acf0094 100644 --- a/vllm/utils/tensor_schema.py +++ b/vllm/utils/tensor_schema.py @@ -60,6 +60,9 @@ class TensorSchema: def __getitem__(self, item) -> Any: return getattr(self, item) + def get(self, item, default=None) -> Any: + return getattr(self, item, default) + def _match_shape_with_dynamic(self, actual: tuple[int, ...], reference: tuple[int, ...], expected_shape: tuple[Union[int, str], ...], From 8c50d62f5a51799c2ecc1ad25380a5a6dd7c7180 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Mon, 11 Aug 2025 00:20:00 +0800 Subject: [PATCH 146/932] Remove redundant row_indices unsqueeze operation in MiniCPMO (#22528) Signed-off-by: zitian.zhao --- vllm/model_executor/models/minicpmo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 1ee0a94c37..e1746695bd 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -605,7 +605,6 @@ class MiniCPMO(MiniCPMV2_6): max=size) # Create column indices for broadcasting col_indices = torch.arange(size, device=device).unsqueeze(0) - row_indices = row_indices.unsqueeze(1) start_indices = start_indices.unsqueeze(1) end_indices = end_indices.unsqueeze(1) # Vectorized mask creation From 68b254d67300a1740db900a3d0ff4252424715d7 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 10:16:44 -0700 Subject: [PATCH 147/932] Fix TensorSchema validation test for symbolic dims (#22366) Signed-off-by: Benji Beck --- tests/standalone_tests/test_tensor_schema.py | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/standalone_tests/test_tensor_schema.py b/tests/standalone_tests/test_tensor_schema.py index e98aa3f53f..69744921b1 100644 --- a/tests/standalone_tests/test_tensor_schema.py +++ b/tests/standalone_tests/test_tensor_schema.py @@ -4,8 +4,8 @@ import pytest import torch -from vllm.model_executor.models.fuyu import 
FuyuImagePatchInputs from vllm.model_executor.models.glm4_1v import Glm4vImageEmbeddingInputs +from vllm.model_executor.models.granite_speech import GraniteSpeechAudioInputs from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs @@ -129,23 +129,27 @@ def test_tensor_schema_with_invalid_resolve_binding_dims(): def test_tensor_schema_with_list_of_symbolic_dim(): - flat_data = torch.stack([torch.randn(768) for _ in range(3)]) # (bn=3, fn) - patches_per_image = [64, 64, 64] # len = bn = 3 + input_features = torch.randn(3, 10, 160) # (b=3, fi=10, 160) + input_features_mask = torch.randn(3, 8) # (b=3, fo=8) + audio_embed_sizes = [8, 8, 8] # len = b = 3 - FuyuImagePatchInputs( - flat_data=flat_data, - patches_per_image=patches_per_image, + GraniteSpeechAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + audio_embed_sizes=audio_embed_sizes, ) def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length(): - flat_data = torch.stack([torch.randn(768) for _ in range(4)]) # (bn=4, fn) - patches_per_image = [64, 64, 64] # len = 3 ≠ bn + input_features = torch.randn(4, 10, 160) # (b=4, fi=10, 160) + input_features_mask = torch.randn(4, 8) # (b=4, fo=8) + audio_embed_sizes = [8, 8, 8] # len = 3 ≠ b - with pytest.raises(ValueError, match="expected 'bn'=4, got 3"): - FuyuImagePatchInputs( - flat_data=flat_data, - patches_per_image=patches_per_image, + with pytest.raises(ValueError, match="expected 'b'=4, got 3"): + GraniteSpeechAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + audio_embed_sizes=audio_embed_sizes, ) From d1af8b7be9c5ad9d2926ce215771e9cd7279147b Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Sun, 10 Aug 2025 19:29:02 -0400 Subject: [PATCH 148/932] enable Docker-aware precompiled wheel setup (#22106) Signed-off-by: dougbtv --- docker/Dockerfile | 15 ++-- setup.py | 185 +++++++++++++++++++++++++--------------------- vllm/envs.py | 11 ++- 3 files changed, 116 insertions(+), 95 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 04a63f5d68..85f55cac8d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 
setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi diff --git a/setup.py b/setup.py index e374fcb816..7f6c787129 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -281,10 +282,81 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class repackage_wheel(build_ext): +class precompiled_build_ext(build_ext): + """Disables extension building when using precompiled binaries.""" + + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + def build_extensions(self) -> None: + print("Skipping build_ext: using precompiled extensions.") + return + + +class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" - def get_base_commit_in_main_branch(self) -> str: + @staticmethod + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + import tempfile + import zipfile + + temp_dir = None + try: + if not os.path.isfile(wheel_url_or_path): + wheel_filename = wheel_url_or_path.split("/")[-1] + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + print(f"Downloading wheel from {wheel_url_or_path} " + f"to {wheel_path}") + from urllib.request import urlretrieve + urlretrieve(wheel_url_or_path, filename=wheel_path) + else: + wheel_path = wheel_url_or_path + print(f"Using existing wheel at {wheel_path}") + + package_data_patch = {} + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + ] + + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members = list( + filter(lambda x: x.filename in files_to_copy, + wheel.filelist)) + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"[extract] {file.filename}") + target_path = os.path.join(".", file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open( + target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + pkg = os.path.dirname(file.filename).replace("/", ".") + package_data_patch.setdefault(pkg, []).append( + os.path.basename(file.filename)) + + return package_data_patch + finally: + if temp_dir is not None: + print(f"Removing temporary directory {temp_dir}") + shutil.rmtree(temp_dir) + + @staticmethod + def get_base_commit_in_main_branch() -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -297,6 +369,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. 
+ if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -329,86 +405,6 @@ class repackage_wheel(build_ext): "wheel may not be compatible with your dev branch: %s", err) return "nightly" - def run(self) -> None: - assert _is_cuda( - ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is None: - base_commit = self.get_base_commit_in_main_branch() - wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - # Fallback to nightly wheel if latest commit wheel is unavailable, - # in this rare case, the nightly release CI hasn't finished on main. - if not is_url_available(wheel_location): - wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - import zipfile - - if os.path.isfile(wheel_location): - wheel_path = wheel_location - print(f"Using existing wheel={wheel_path}") - else: - # Download the wheel from a given URL, assume - # the filename is the last part of the URL - wheel_filename = wheel_location.split("/")[-1] - - import tempfile - - # create a temporary directory to store the wheel - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - - from urllib.request import urlretrieve - - try: - urlretrieve(wheel_location, filename=wheel_path) - except Exception as e: - from setuptools.errors import SetupError - - raise SetupError( - f"Failed to get vLLM wheel from {wheel_location}") from e - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet - ] - - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"Extracting and including {file.filename} " - "from existing wheel") - package_name = os.path.dirname(file.filename).replace("/", ".") - file_name = os.path.basename(file.filename) - - if package_name not in package_data: - package_data[package_name] = [] - - wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -639,6 +635,29 @@ package_data = { ] } +# If using precompiled, extract and patch package_data (in advance of setup) +if envs.VLLM_USE_PRECOMPILED: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + wheel_url = wheel_location + else: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + from urllib.request 
import urlopen + try: + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + except Exception as e: + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url) + for pkg, files in patch.items(): + package_data.setdefault(pkg, []).extend(files) + if _no_device(): ext_modules = [] @@ -647,7 +666,7 @@ if not ext_modules: else: cmdclass = { "build_ext": - repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext } setup( diff --git a/vllm/envs.py b/vllm/envs.py index f81f6dacd8..c26c7f215d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False CMAKE_BUILD_TYPE: Optional[str] = None @@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. 
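
The precompiled-wheel patch above ([PATCH 148/932]) also resolves the Dockerfile TODO it deletes: the old expression bool(os.environ.get("VLLM_USE_PRECOMPILED")) treats any non-empty string — including "0" — as true, while the new envs.py lambda only accepts "1" or "true" after stripping and lowercasing. Below is a minimal sketch contrasting the two behaviours; the VLLM_PRECOMPILED_WHEEL_LOCATION override that both versions share is left out for brevity, and the two function names are illustrative only, not vLLM's envs API.

def old_use_precompiled(env: dict) -> bool:
    # Pre-patch behaviour: any non-empty string is truthy, so "0" still enabled it.
    return bool(env.get("VLLM_USE_PRECOMPILED"))

def new_use_precompiled(env: dict) -> bool:
    # Post-patch behaviour: only "1" or "true" (case-insensitive, stripped) enable it.
    return env.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")

for value in (None, "", "0", "1", "true", "False"):
    env = {} if value is None else {"VLLM_USE_PRECOMPILED": value}
    print(repr(value), old_use_precompiled(env), new_use_precompiled(env))

Running the loop prints old=True, new=False for "0" and "False", which is exactly the truthiness trap the removed Dockerfile comment called out.
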
From a554991748584b00e3bbd2ab192cbcac3f630263 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 19:29:16 -0700 Subject: [PATCH 149/932] Migrate LlavaNextVideoPixelInputs to TensorSchema (#21843) Signed-off-by: Benji Beck --- .../model_executor/models/llava_next_video.py | 57 +++++++------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a96df0b6f5..abc519edad 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -3,7 +3,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, Union +from typing import Annotated, Literal, Optional, Union import torch import torch.nn as nn @@ -25,6 +25,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava @@ -35,17 +36,25 @@ from .utils import (AutoWeightsLoader, WeightsMapper, from .vision import get_vision_encoder_info -class LlavaNextVideoPixelInputs(TypedDict): - type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, list[torch.Tensor]] - """ - Shape: `(batch_size, num_frames, num_channels, height, width)` +class LlavaNextVideoPixelInputs(TensorSchema): + """ + Dimensions: + - bs: Batch size + - nv: Number of videos + - nf: Number of frames + - nc: Number of channels (3) + - h: Height of each frame + - w: Width of each frame Note that `num_frames` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. Note that it only supports one video input for one batch. """ + type: Literal["pixel_values_videos"] = "pixel_values_videos" + + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bs", "nv", "nf", 3, "h", "w")] class LlavaNextVideoProcessingInfo(BaseProcessingInfo): @@ -320,27 +329,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) - def _validate_video_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape[2:]) - - if actual_dims != expected_dims: - expected_expr = ("num_frames", *map(str, expected_dims)) - raise ValueError( - "The expected shape of pixel values in each video frame " - f"is {expected_expr}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[LlavaNextVideoPixelInputs]: """ @@ -355,14 +343,13 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, if pixel_values_videos is None: return None - if not isinstance(pixel_values_videos, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel_values_videos. 
" - f"Got type: {type(pixel_values_videos)}") - - return LlavaNextVideoPixelInputs( - type="pixel_values_videos", - data=pixel_values_videos, - ) + expected_h = expected_w = self.config.vision_config.image_size + return LlavaNextVideoPixelInputs(type="pixel_values_videos", + data=pixel_values_videos, + resolve_bindings={ + "h": expected_h, + "w": expected_w, + }) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: From 06da44f0cbf84da771a2a1e336e06432a09875c8 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 19:29:19 -0700 Subject: [PATCH 150/932] Migrate LlavaImageInputs to TensorSchema (#21770) Signed-off-by: Benji Beck --- vllm/model_executor/models/llava.py | 67 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index cfc6ffd99a..708ca98995 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union, cast) import torch @@ -33,6 +33,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -44,35 +45,46 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .vision import get_vision_encoder_info -class LlavaImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: torch.Tensor +class LlavaImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height + - w: Width + Note that `height` or `width` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")] -class PixtralHFImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class PixtralHFImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels + - h: Height + - w: Width + Note that `height` or `width` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "c", "h", "w")] -class LlavaImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. 
+class LlavaImageEmbeddingInputs(TensorSchema): """ + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size + - hs: Hidden size (must match language model backbone) + """ + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")] LlavaImageInputs = Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs, @@ -547,19 +559,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -579,10 +578,14 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): pixel_values=flatten_bn(pixel_values), ) + expected_h = expected_w = self.config.vision_config.image_size return LlavaImagePixelInputs( type="pixel_values", - pixel_values=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + pixel_values=flatten_bn(pixel_values, concat=True), + resolve_bindings={ + "h": expected_h, + "w": expected_w + }, ) if image_embeds is not None: From b799f4b9ea8d15d62c4f4a97926b274561fd9492 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 10 Aug 2025 19:30:00 -0700 Subject: [PATCH 151/932] [CI/Build] Fix tensorizer test for load_format change (#22583) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 - tests/entrypoints/openai/test_tensorizer_entrypoint.py | 2 +- tests/tensorizer_loader/test_tensorizer.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 221888edb3..db7351edbb 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -426,7 +426,6 @@ steps: - label: Tensorizer Test # 11min mirror_hardwares: [amdexperimental] - soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader - tests/tensorizer_loader diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index 4bf3798503..058e96f203 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -44,7 +44,7 @@ def model_uri(tmp_dir): def tensorize_model_and_lora(tmp_dir, model_uri): tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir) - args = EngineArgs(model=MODEL_NAME, device="cuda") + args = EngineArgs(model=MODEL_NAME) tensorize_lora_adapter(LORA_PATH, tensorizer_config) tensorize_vllm_model(args, tensorizer_config) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b8d7892e57..0fb142a1b6 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -166,7 +166,7 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref): combined_output = out + err assert 
("ValueError: Model loader extra config " "is not supported for load " - "format LoadFormat.AUTO") in combined_output + "format auto") in combined_output finally: del model gc.collect() @@ -186,7 +186,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, combined_output = out + err assert ("ValueError: Model loader extra config is not supported " - "for load format LoadFormat.SAFETENSORS") in combined_output + "for load format safetensors") in combined_output finally: del model gc.collect() From 5898b135abc7b7c0ef7107d21a07d54a84314b7c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 10 Aug 2025 19:33:48 -0700 Subject: [PATCH 152/932] [BugFix] Fix KVConnectorOutput TPU breakage (#22598) Signed-off-by: Nick Hill --- tests/v1/kv_connector/unit/utils.py | 12 ++++++++---- vllm/v1/core/sched/scheduler.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 13 +++++++++---- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 291c84d117..c22d5b861e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -179,6 +179,13 @@ def create_model_runner_output( sampled_token = EOS_TOKEN_ID if use_eos else 0 sampled_token_ids = [[sampled_token] for _ in req_ids] + kv_connector_output = None if ( + finished_sending is None + and finished_recving is None) else KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + # Make output data structure. return ModelRunnerOutput( req_ids=req_ids, @@ -188,10 +195,7 @@ def create_model_runner_output( logprobs=None, prompt_logprobs_dict={}, pooler_output=None, - kv_connector_output=KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving, - ), + kv_connector_output=kv_connector_output, ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 85fc1a4a01..dcb9f4dd36 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1151,8 +1151,8 @@ class Scheduler(SchedulerInterface): scheduler the request during the next step. """ - assert self.connector is not None - self.connector.update_connector_output(kv_connector_output) + if self.connector is not None: + self.connector.update_connector_output(kv_connector_output) # KV Connector:: update recv and send status from last step. 
for req_id in (kv_connector_output.finished_recving or ()): diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 915869726f..ae0219458e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1138,6 +1138,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): i, target_slice] = valid_sampled_token_ids[i] req_state.output_token_ids.extend(valid_sampled_token_ids[i]) + kv_connector_output = None if ( + finished_sending is None + and finished_recving is None) else KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, @@ -1146,10 +1153,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, pooler_output=[], - kv_connector_output=KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving, - )) + kv_connector_output=kv_connector_output, + ) # Check there are no new graphs compiled - all the graphs should be # captured and compiled during warm up. From 1b9902806915040ac9b3029f2ab7522ec505afc3 Mon Sep 17 00:00:00 2001 From: Lifans Date: Sun, 10 Aug 2025 19:49:51 -0700 Subject: [PATCH 153/932] [Misc][gpt-oss] Add rules to label gpt-oss related PRs (#22600) Signed-off-by: Lifan Shen --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index d8ae509e0a..495d207d44 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -118,6 +118,20 @@ pull_request_rules: add: - qwen +- name: label-gpt-oss + description: Automatically apply gpt-oss label + conditions: + - or: + - files~=^examples/.*gpt[-_]?oss.*\.py + - files~=^tests/.*gpt[-_]?oss.*\.py + - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py + - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py + - title~=(?i)gpt[-_]?oss + actions: + label: + add: + - gpt-oss + - name: label-rocm description: Automatically apply rocm label conditions: From afa5b7ca0b417abadfa85e32f28969b72e58a885 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Sun, 10 Aug 2025 21:29:35 -0700 Subject: [PATCH 154/932] [Misc][gpt-oss] guard import when triton kernel when not up to date (#22584) Signed-off-by: zhewenli --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 6d6a2e22bc..6b5284dc6c 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -5,15 +5,24 @@ from typing import TYPE_CHECKING, Any, Optional import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import extract_required_args from vllm.utils import has_triton_kernels +logger = init_logger(__name__) + if has_triton_kernels(): - import triton_kernels.swiglu - from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs - from triton_kernels.routing import routing + try: + import triton_kernels.swiglu + from 
triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, + matmul_ogs) + from triton_kernels.routing import routing + except ModuleNotFoundError: + logger.error( + "Failed to import Triton kernels. Please make sure your triton " + "version is compatible.") if TYPE_CHECKING: from triton_kernels.matmul_ogs import PrecisionConfig From f919d4cb8faac8c869ab87ee705dbd340fae4679 Mon Sep 17 00:00:00 2001 From: Eugene Cheah Date: Sun, 10 Aug 2025 22:52:31 -0700 Subject: [PATCH 155/932] [BugFix] Fix logits repetition penalty cuda check (#22592) --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 92de394180..70605d3c5f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -311,7 +311,7 @@ def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, output_mask: A boolean tensor indicating which tokens appear in the output. repetition_penalties: The repetition penalties of shape (num_seqs, ). """ - if current_platform.is_cuda() and logits.is_contiguous(): + if logits.is_cuda and logits.is_contiguous(): apply_repetition_penalties_cuda(logits, prompt_mask, output_mask, repetition_penalties) else: From 9c97a1c3496d7d8574dd0d2b3fffeae5cc2223ca Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 11 Aug 2025 13:52:34 +0800 Subject: [PATCH 156/932] [ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module. (#22521) Signed-off-by: vllmellm --- .../layers/rotary_embedding/base.py | 71 ++++++++++ .../layers/rotary_embedding/common.py | 4 +- .../rotary_embedding/deepseek_scaling_rope.py | 12 +- .../rotary_embedding/rocm_aiter_rope_ops.py | 127 ++++++++++++++++++ 4 files changed, 204 insertions(+), 10 deletions(-) create mode 100644 vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 10fce857a8..6dfc28be7d 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -8,6 +8,7 @@ import torch from vllm.model_executor.custom_op import CustomOp from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch +from .rocm_aiter_rope_ops import is_rocm_rotary_embedding_enabled @CustomOp.register("rotary_embedding") @@ -35,6 +36,7 @@ class RotaryEmbedding(CustomOp): cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) + self.is_rocm_aiter_enabled = is_rocm_rotary_embedding_enabled() def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" @@ -119,6 +121,75 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hip( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first=False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + # currently only rotary embedding ops from AITER package are + # supported for HiP forward. 
+ if self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets, + is_nope_first) + return self.forward_native(positions, query, key, offsets) + + def forward_hip_rocm_aiter( + self, + positions: torch.Tensor, + # if is_nope_first + # [[batch_size, seq_len, num_heads, nope_size+rope_size] + # if NOT is_nope_first + # [[batch_size, seq_len, num_heads, rope_size+nope_size], + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + cos, sin = self.cos_sin_cache.chunk(2, dim=-1) + + cos = cos.unsqueeze(-2).unsqueeze(-2) + sin = sin.unsqueeze(-2).unsqueeze(-2) + + rotate_style = 0 if self.is_neox_style else 1 + + num_tokens = positions.numel() + + query_shape = query.shape + query = query.view(1, num_tokens, -1, self.head_size) + if key is not None: + key_shape = key.shape + key = key.view(1, num_tokens, -1, self.head_size) + + positions = positions.view(*query.shape[:2]) + if offsets is not None: + offsets = offsets.view(*query.shape[:2]) + + if not is_nope_first: + query_ = query[..., :self.rotary_dim] + key_ = key[..., :self.rotary_dim] if key is not None else None + else: + query_ = query[..., -self.rotary_dim:] + key_ = key[..., -self.rotary_dim:] if key is not None else None + + if key_ is None: + torch.ops.vllm.rocm_aiter_rotary_emb_without_key_forward_hip( + positions, sin, cos, query_, offsets, rotate_style, + is_nope_first) + return query.view(query_shape), None + + torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_hip( + positions, sin, cos, query_, key_, offsets, rotate_style, + is_nope_first) + + return query.view(query_shape), key.view(key_shape) + def forward_xpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 8d821bea19..99b6bb2120 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -99,7 +99,7 @@ def yarn_linear_ramp_mask(low: float, high: float, dim: int, return ramp_func -def yarn_get_mscale(scale: float = 1) -> float: +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: if scale <= 1: return 1.0 - return 0.1 * math.log(scale) + 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index cd888b7334..5af671703a 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math from typing import Optional import torch @@ -10,13 +9,7 @@ from vllm.platforms import current_platform from .base import RotaryEmbedding from .common import (rotate_gptj, rotate_neox, yarn_find_correction_range, - yarn_linear_ramp_mask) - - -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 + yarn_get_mscale, yarn_linear_ramp_mask) class DeepseekScalingRotaryEmbedding(RotaryEmbedding): @@ -96,6 +89,9 @@ 
class DeepseekScalingRotaryEmbedding(RotaryEmbedding): offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" + if self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets) + assert key is not None query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py new file mode 100644 index 0000000000..91a2318bad --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +import vllm.envs as envs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op + + +def is_rocm_rotary_embedding_enabled() -> bool: + return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER) + + +def rocm_aiter_rotary_emb_without_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_fwd_inplace( + query, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_fwd_inplace( + query, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +def rocm_aiter_rotary_emb_without_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +if is_rocm_rotary_embedding_enabled(): + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_with_key_forward_hip", + op_func=rocm_aiter_rotary_emb_with_key_forward_hip_impl, + mutates_args=["key", "query"], + fake_impl=rocm_aiter_rotary_emb_with_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_without_key_forward_hip", + op_func=rocm_aiter_rotary_emb_without_key_forward_hip_impl, + mutates_args=["query"], + 
fake_impl=rocm_aiter_rotary_emb_without_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) \ No newline at end of file From 39052dbca87616a549ab152713f1a3020b2f4eb8 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Mon, 11 Aug 2025 02:54:59 -0300 Subject: [PATCH 157/932] Support token_type_ids in V1 with less code changes (#21985) Signed-off-by: Max de Bayser --- tests/entrypoints/openai/test_rerank.py | 4 +- tests/entrypoints/openai/test_score.py | 4 +- tests/models/language/pooling/test_scoring.py | 9 ++ vllm/entrypoints/llm.py | 54 ++++++------ vllm/entrypoints/openai/serving_score.py | 82 +++++++---------- vllm/entrypoints/score_utils.py | 40 ++++++++- vllm/model_executor/models/bert.py | 88 +++++++++++++------ vllm/model_executor/models/roberta.py | 36 ++++---- vllm/pooling_params.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 40 +++++++++ 10 files changed, 235 insertions(+), 130 deletions(-) diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index f121693e32..73364294cb 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -126,7 +126,9 @@ def test_invocations(server: RemoteOpenAIServer): invocation_output["results"]): assert rerank_result.keys() == invocations_result.keys() assert rerank_result["relevance_score"] == pytest.approx( - invocations_result["relevance_score"], rel=0.01) + invocations_result["relevance_score"], rel=0.05) + # TODO: reset this tolerance to 0.01 once we find + # an alternative to flash_attn with bfloat16 @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 1a5df1d2db..cb6ec795ae 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -220,7 +220,9 @@ class TestModel: invocation_output["data"]): assert score_data.keys() == invocation_data.keys() assert score_data["score"] == pytest.approx( - invocation_data["score"], rel=0.01) + invocation_data["score"], rel=0.05) + # TODO: reset this tolerance to 0.01 once we find + # an alternative to flash_attn with bfloat16 def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index ef9d5530cd..6b5ff70681 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -23,6 +23,15 @@ TEXTS_2 = [ "The capital of Germany is Berlin.", ] + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + DTYPE = "half" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index ca24b0c32b..4014a961c6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -28,11 +28,15 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_mistral_chat_template, parse_chat_messages, resolve_chat_template_content_format) +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam, _cosine_similarity, _validate_score_input_lens, + compress_token_type_ids, get_score_prompt) +# yapf: enable from vllm.entrypoints.utils import (_validate_truncation_size, log_non_default_args) from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt @@ -1329,6 
+1333,7 @@ class LLM: model_config = self.llm_engine.model_config pooling_params.verify("score", model_config) + pooling_params_list = list[PoolingParams]() tokenization_kwargs: dict[str, Any] = {} @@ -1339,38 +1344,31 @@ class LLM: input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] - if model_config.is_multimodal_model: - for q, d in input_pairs: - _, engine_prompt = get_score_prompt( - model_config=model_config, - data_1=q, - data_2=d, - tokenizer=tokenizer, - tokenization_kwargs=tokenization_kwargs, - ) + model_config = self.llm_engine.model_config - parsed_prompts.append(engine_prompt) - else: - for q, t in input_pairs: - if model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. - prompt_inputs = tokenizer( - text=q, # type: ignore[arg-type] - text_pair=t, # type: ignore[arg-type] - **tokenization_kwargs) - else: - # `llm as reranker` models defaults to not using pad_token. - prompt_inputs = tokenizer( - text=q + t, # type: ignore[operator] - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) + for q, d in input_pairs: + _, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=q, + data_2=d, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + + if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( + "token_type_ids", None)): + params = pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + params.extra_kwargs = {"compressed_token_type_ids": compressed} + pooling_params_list.append(params) + else: + pooling_params_list.append(pooling_params) + + parsed_prompts.append(engine_prompt) self._validate_and_add_requests( prompts=parsed_prompts, - params=pooling_params, + params=pooling_params_list, use_tqdm=use_tqdm, lora_request=lora_request, ) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 4da2094147..c246274514 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -7,6 +7,7 @@ from typing import Any, Optional, Union from fastapi import Request +from vllm import envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -17,11 +18,15 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, ScoreResponseData, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam, _cosine_similarity, _validate_score_input_lens, + compress_token_type_ids, get_score_prompt) +# yapf: enable from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger @@ -158,6 +163,8 @@ class ServingScores(OpenAIServing): tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, ) + self._validate_input(request, engine_prompt["prompt_token_ids"], + full_prompt) if request.mm_processor_kwargs is not None: engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs @@ -188,64 +195,27 @@ class ServingScores(OpenAIServing): input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] - if self.model_config.is_multimodal_model: + preprocess_async = 
make_async(self._preprocess_score, + executor=self._tokenizer_executor) - preprocess_async = make_async(self._preprocess_score, - executor=self._tokenizer_executor) + preprocessed_prompts = await asyncio.gather( + *(preprocess_async(request=request, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + data_1=t1, + data_2=t2) for t1, t2 in input_pairs)) - preprocessed_prompts = await asyncio.gather( - *(preprocess_async(request=request, - tokenizer=tokenizer, - tokenization_kwargs=tokenization_kwargs, - data_1=t1, - data_2=t2) for t1, t2 in input_pairs)) - - for full_prompt, engine_prompt in preprocessed_prompts: - request_prompts.append(full_prompt) - engine_prompts.append(engine_prompt) - - else: - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - use_pad_token = self.model_config.use_pad_token - - if use_pad_token: - # cross_encoder models defaults to using pad_token. - tokenized_prompts = await asyncio.gather(*( - tokenize_async( - text=t1, # type: ignore[arg-type] - text_pair=t2, # type: ignore[arg-type] - **tokenization_kwargs) for t1, t2 in input_pairs)) - else: - # `llm as reranker` models defaults to not using pad_token. - tokenized_prompts = await asyncio.gather(*( - tokenize_async( - text=t1 + # type: ignore[operator] - t2, - **tokenization_kwargs) for t1, t2 in input_pairs)) - - for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs): - sep_token = tokenizer.sep_token if (tokenizer.sep_token - and use_pad_token) else '' - request_prompt = f"{t1}{sep_token}{t2}" - - input_ids = prompt_inputs["input_ids"] - text_token_prompt = \ - self._validate_input(request, input_ids, request_prompt) - engine_prompt = TokensPrompt( - prompt_token_ids=text_token_prompt["prompt_token_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) + for full_prompt, engine_prompt in preprocessed_prompts: + request_prompts.append(full_prompt) + engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] - pooling_params = request.to_pooling_params() + default_pooling_params = request.to_pooling_params() try: - pooling_params.verify("score", self.model_config) + default_pooling_params.verify("score", self.model_config) except ValueError as e: return self.create_error_response(str(e)) @@ -254,9 +224,19 @@ class ServingScores(OpenAIServing): self._log_inputs(request_id_item, request_prompts[i], - params=pooling_params, + params=default_pooling_params, lora_request=lora_request) + if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( + "token_type_ids", None)): + pooling_params = default_pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + pooling_params.extra_kwargs = { + "compressed_token_type_ids": compressed + } + else: + pooling_params = (default_pooling_params) + generator = self.engine_client.encode( engine_prompt, pooling_params, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index f3f042355c..642d638953 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -184,15 +184,49 @@ def get_score_prompt( model_config, tokenizer, ) + from vllm.model_executor.model_loader import get_model_cls - full_prompt = apply_score_template(model_config, prompt_1, prompt_2) - - prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + model = get_model_cls(model_config) + if supports_score_template(model): + full_prompt = apply_score_template(model_config, prompt_1, prompt_2) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + elif model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer(text=prompt_1, + text_pair=prompt_2, + **tokenization_kwargs) + full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + else: + # `llm as reranker` models defaults to not using pad_token. + full_prompt = prompt_1 + prompt_2 + prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) + if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None: + engine_prompt["token_type_ids"] = token_type_ids + post_process_tokens(model_config, engine_prompt) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data return full_prompt, engine_prompt + + +def compress_token_type_ids(token_type_ids: list[int]) -> int: + """ + Return position of the first 1 or the length of the list + if not found. 
+ """ + first_one = len(token_type_ids) + err_msg = "Token type ids are expected to be a sequence"\ + " of zeros followed by a sequence of ones" + for i, type_id in enumerate(token_type_ids): + if type_id == 0 and first_one < i: + raise ValueError(err_msg) + elif type_id == 1 and first_one > i: + first_one = i + elif type_id > 1: + raise ValueError(err_msg) + + return first_one diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 8f988903f7..3d5d5d505b 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only +from .interfaces import SupportsCrossEncoding, SupportsQuant from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -60,21 +60,13 @@ class BertEmbedding(nn.Module): self, input_ids: torch.Tensor, position_ids: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - input_shape = input_ids.size() - # Input embeddings. + token_type_ids = _decode_token_type_ids(input_ids) + inputs_embeds = self.word_embeddings(input_ids) - - # Position embeddings. position_embeddings = self.position_embeddings(position_ids) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device) - token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings @@ -350,25 +342,23 @@ class BertModel(nn.Module, SupportsQuant): ) -> None: super().__init__() - config = vllm_config.model_config.hf_config - self.embeddings = embedding_class(config) + self.config = vllm_config.model_config.hf_config + self.embeddings = embedding_class(self.config) self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") def forward( self, input_ids: torch.Tensor, - position_ids: torch.Tensor, + positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids) + position_ids=positions) return self.encoder(hidden_states) def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): @@ -468,13 +458,11 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): self, input_ids: torch.Tensor, positions: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: return self.model(input_ids=input_ids, - position_ids=positions, - token_type_ids=token_type_ids, + positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) @@ -508,8 +496,53 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): }) -class BertForSequenceClassification(nn.Module, SupportsV0Only, - SupportsCrossEncoding, SupportsQuant): +# Here we encode the token type ids together with the input ids. +# Since we use int 32 for the input IDs and the vocabulary size +# is way lower than 2**31, there is room to encode additional +# bits. 
At the same time, for cross-encoder use cases, the +# token type ids are only 0 or 1, requiring only 1 bit. +# This means that we can store the token type ids in the 31st +# bit. We void the 32nd bit because that would produce a negative +# number, which could be used to signal other things. +# +# The reason for all of this is that all the tensors that are +# passed as input to the forward function of a module marked +# with @support_torch_compile have to be persistent. So to +# avoid adding more persistent tensors in the model runner, we +# encode more information in the same persistent tensor. +# +# Since the *ForClassification module is outside of the BertModel +# which is compiled, we can do the encoding here and then separate +# the information again in the Embedding layer. Since with bit masks +# we can do this entirely with torch operations and without branching, +# it works with torch compile. + +TOKEN_TYPE_SHIFT = 30 + + +def _encode_token_type_ids(input_ids: torch.Tensor, + token_type_ids: torch.Tensor) -> None: + # input_ids can be padded to the right + input_ids[:token_type_ids.shape[0]].bitwise_or_( + token_type_ids << TOKEN_TYPE_SHIFT) + + +def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: + + ids_mask = torch.ones(input_ids.shape, + dtype=torch.int32, + device=input_ids.device) << TOKEN_TYPE_SHIFT + tokens_mask = ids_mask.bitwise_not() + + token_type_ids = input_ids.bitwise_and(ids_mask) >> TOKEN_TYPE_SHIFT + + input_ids.bitwise_and_(tokens_mask) + + return token_type_ids + + +class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, + SupportsQuant): """A model that uses Bert to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -567,8 +600,13 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only, inputs_embeds: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: + + if token_type_ids is not None: + assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) + return self.bert(input_ids=input_ids, - position_ids=positions, + positions=positions, inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors, - token_type_ids=token_type_ids) + intermediate_tensors=intermediate_tensors) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 61c8faed40..005b917982 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -14,13 +14,16 @@ from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool, DispatchPooler, Pooler) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel +from vllm.model_executor.models.bert import (TOKEN_TYPE_SHIFT, + BertEmbeddingModel, BertModel, + _decode_token_type_ids, + _encode_token_type_ids) from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix) from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import SupportsCrossEncoding class RobertaEmbedding(nn.Module): @@ -53,17 +56,12 @@ class RobertaEmbedding(nn.Module): self, input_ids: torch.Tensor, position_ids: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - 
input_shape = input_ids.size() - inputs_embeds = self.word_embeddings(input_ids) - # Position embeddings. + token_type_ids = _decode_token_type_ids(input_ids) + + inputs_embeds = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings @@ -107,7 +105,6 @@ class RobertaEmbeddingModel(BertEmbeddingModel): self, input_ids: torch.Tensor, positions: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -119,9 +116,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel): position_ids=positions, padding_idx=self.padding_idx) - return self.model(input_ids, - positions, - token_type_ids=token_type_ids, + return self.model(input_ids=input_ids, + positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) @@ -153,8 +149,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): return loader.load_weights(weights_list, mapper=mapper) -class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, - SupportsV0Only): +class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -226,11 +221,14 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, replace_roberta_positions(input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx) + if token_type_ids is not None: + assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) return self.roberta(input_ids=input_ids, - position_ids=positions, + positions=positions, inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors, - token_type_ids=token_type_ids) + intermediate_tensors=intermediate_tensors) # Adapted from transformers diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 7077f68353..29f037b437 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional import msgspec @@ -46,6 +46,9 @@ class PoolingParams( requires_token_ids: bool = False """Internal use only.""" + extra_kwargs: Optional[dict[str, Any]] = None + """Internal use only.""" + output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY @property @@ -167,7 +170,8 @@ class PoolingParams( f"softmax={self.softmax}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " - f"requires_token_ids={self.requires_token_ids})") + f"requires_token_ids={self.requires_token_ids}, " + f"extra_kwargs={self.extra_kwargs})") def __post_init__(self) -> None: assert self.output_kind == RequestOutputKind.FINAL_ONLY,\ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 48ff50fd6b..3cde7c6e96 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -336,6 +336,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): 
self.reorder_batch_threshold: Optional[int] = None + def _init_model_kwargs(self, num_tokens: int): + model_kwargs = dict[str, Any]() + num_reqs = self.input_batch.num_reqs + + pooling_params = self.input_batch.pooling_metadata.pooling_params + + num_pooling_reqs = len(pooling_params) + + if num_pooling_reqs == 0: + return model_kwargs + + assert num_pooling_reqs == num_reqs + + token_type_id_requests = dict[int, Any]() + for i, param in enumerate(pooling_params): + if param.extra_kwargs is not None and \ + (token_types := param.extra_kwargs.get( + "compressed_token_type_ids")) is not None: + token_type_id_requests[i] = token_types + + if len(token_type_id_requests) == 0: + return model_kwargs + + seq_lens = self.seq_lens[:num_reqs] + token_type_ids = [] + + for i in range(num_reqs): + pos = token_type_id_requests.get(i, seq_lens[i]) + ids = (torch.arange(seq_lens[i]) >= pos).int() + token_type_ids.append(ids) + + model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to( + device=self.device) + return model_kwargs + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ Update the order of requests in the batch based on the attention @@ -1504,12 +1539,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_ids = None inputs_embeds = self.inputs_embeds[:num_input_tokens] model_mm_kwargs = self._extract_mm_kwargs(scheduler_output) + model_kwargs = self._init_model_kwargs(num_scheduled_tokens) else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the # multimodal models, it is not desirable for performance since # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] + model_kwargs = self._init_model_kwargs(num_input_tokens) inputs_embeds = None model_mm_kwargs = {} if self.uses_mrope: @@ -1548,6 +1585,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_mm_kwargs, device=self.device, ), + **model_kwargs, ) if self.use_aux_hidden_state_outputs: @@ -2211,6 +2249,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): + model_kwargs = self._init_model_kwargs(num_tokens) if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] @@ -2252,6 +2291,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_mm_kwargs, device=self.device, ), + **model_kwargs, ) if self.use_aux_hidden_state_outputs: From 384a052971607f1561e734c87c9216f77f47e0fb Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 11 Aug 2025 15:13:27 +0800 Subject: [PATCH 158/932] [Misc] benchmark_moe supports expert parallel (#22251) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 72250e2fb6..13bf1be836 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,10 +22,10 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() -def ensure_divisibility(numerator, denominator): +def ensure_divisibility(numerator, denominator, text): """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, ( - "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) + assert 
numerator % denominator == 0, "{} {} is not divisible by tp {}.".format( + text, numerator, denominator ) @@ -577,12 +577,10 @@ def main(args: argparse.Namespace): E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] == "JambaForCausalLM": E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ( "DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM", @@ -591,17 +589,14 @@ def main(args: argparse.Namespace): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): E = config.num_experts topk = config.moe_topk[0] intermediate_size = config.moe_intermediate_size[0] - shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Support for llama4 config = config.get_text_config() @@ -609,8 +604,14 @@ def main(args: argparse.Namespace): E = config.num_local_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + enable_ep = bool(args.enable_expert_parallel) + if enable_ep: + ensure_divisibility(E, args.tp_size, "Number of experts") + E = E // args.tp_size + shard_intermediate_size = 2 * intermediate_size + else: + ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - ensure_divisibility(intermediate_size, args.tp_size) hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" @@ -742,6 +743,7 @@ if __name__ == "__main__": parser.add_argument( "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 ) + parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument( "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) From 1e55dfa7e552e0995630a2563aeae443945e2e81 Mon Sep 17 00:00:00 2001 From: JartX Date: Mon, 11 Aug 2025 09:13:30 +0200 Subject: [PATCH 159/932] [BUGFIX] KeyError 'layers.14.mlp.gate.g_idx' for Qwen3-MoE with GPTQ on ROCm (#22017) --- vllm/model_executor/models/qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 3d1e72299b..9b49952f37 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -149,7 +149,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): self.gate = ReplicatedLinear(config.hidden_size, config.num_experts, bias=False, - quant_config=None, + quant_config=quant_config, prefix=f"{prefix}.gate") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: From bc1d02ac85d834c98ec2794f1122b269f4c3e45b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:13:33 +0100 Subject: [PATCH 160/932] [Docs] Add comprehensive CLI reference for all 
large `vllm` subcommands (#22601) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 10 ++- docs/api/{summary.md => README.md} | 0 docs/cli/.meta.yml | 1 + docs/cli/.nav.yml | 8 +++ docs/cli/README.md | 87 +++++++++++++------------- docs/cli/bench/latency.md | 9 +++ docs/cli/bench/serve.md | 9 +++ docs/cli/bench/throughput.md | 9 +++ docs/cli/chat.md | 5 ++ docs/cli/complete.md | 5 ++ docs/cli/json_tip.inc.md | 9 +++ docs/cli/run-batch.md | 9 +++ docs/cli/serve.md | 9 +++ docs/configuration/engine_args.md | 10 +-- docs/mkdocs/hooks/generate_argparse.py | 49 ++++++++++----- requirements/docs.txt | 2 + vllm/benchmarks/throughput.py | 4 +- vllm/entrypoints/cli/openai.py | 70 ++++++++++++--------- vllm/entrypoints/openai/run_batch.py | 5 +- vllm/utils/__init__.py | 5 +- 20 files changed, 205 insertions(+), 110 deletions(-) rename docs/api/{summary.md => README.md} (100%) create mode 100644 docs/cli/.meta.yml create mode 100644 docs/cli/.nav.yml create mode 100644 docs/cli/bench/latency.md create mode 100644 docs/cli/bench/serve.md create mode 100644 docs/cli/bench/throughput.md create mode 100644 docs/cli/chat.md create mode 100644 docs/cli/complete.md create mode 100644 docs/cli/json_tip.inc.md create mode 100644 docs/cli/run-batch.md create mode 100644 docs/cli/serve.md diff --git a/docs/.nav.yml b/docs/.nav.yml index f57703c329..acedc32c30 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -11,7 +11,7 @@ nav: - Quick Links: - User Guide: usage/README.md - Developer Guide: contributing/README.md - - API Reference: api/summary.md + - API Reference: api/README.md - CLI Reference: cli/README.md - Timeline: - Roadmap: https://roadmap.vllm.ai @@ -58,11 +58,9 @@ nav: - CI: contributing/ci - Design Documents: design - API Reference: - - Summary: api/summary.md - - Contents: - - api/vllm/* - - CLI Reference: - - Summary: cli/README.md + - api/README.md + - api/vllm/* + - CLI Reference: cli - Community: - community/* - Blog: https://blog.vllm.ai diff --git a/docs/api/summary.md b/docs/api/README.md similarity index 100% rename from docs/api/summary.md rename to docs/api/README.md diff --git a/docs/cli/.meta.yml b/docs/cli/.meta.yml new file mode 100644 index 0000000000..0e1f7eccee --- /dev/null +++ b/docs/cli/.meta.yml @@ -0,0 +1 @@ +toc_depth: 3 \ No newline at end of file diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml new file mode 100644 index 0000000000..6c2c09d566 --- /dev/null +++ b/docs/cli/.nav.yml @@ -0,0 +1,8 @@ +nav: + - README.md + - serve.md + - chat.md + - complete.md + - run-batch.md + - vllm bench: + - bench/*.md diff --git a/docs/cli/README.md b/docs/cli/README.md index b512a4f4ba..c708eb7958 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -1,7 +1,3 @@ ---- -toc_depth: 4 ---- - # vLLM CLI Guide The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: @@ -16,52 +12,48 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` -When passing JSON CLI arguments, the following sets of arguments are equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` - ## serve -Start the vLLM OpenAI Compatible API server. +Starts the vLLM OpenAI Compatible API server. -??? 
console "Examples" +Start with a model: - ```bash - # Start with a model - vllm serve meta-llama/Llama-2-7b-hf +```bash +vllm serve meta-llama/Llama-2-7b-hf +``` - # Specify the port - vllm serve meta-llama/Llama-2-7b-hf --port 8100 +Specify the port: - # Serve over a Unix domain socket - vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock +```bash +vllm serve meta-llama/Llama-2-7b-hf --port 8100 +``` - # Check with --help for more options - # To list all groups - vllm serve --help=listgroup +Serve over a Unix domain socket: - # To view a argument group - vllm serve --help=ModelConfig +```bash +vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock +``` - # To view a single argument - vllm serve --help=max-num-seqs +Check with --help for more options: - # To search by keyword - vllm serve --help=max +```bash +# To list all groups +vllm serve --help=listgroup - # To view full help with pager (less/more) - vllm serve --help=page - ``` +# To view a argument group +vllm serve --help=ModelConfig -### Options +# To view a single argument +vllm serve --help=max-num-seqs ---8<-- "docs/argparse/serve.md" +# To search by keyword +vllm serve --help=max + +# To view full help with pager (less/more) +vllm serve --help=page +``` + +See [vllm serve](./serve.md) for the full reference of all available arguments. ## chat @@ -78,6 +70,8 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm chat --quick "hi" ``` +See [vllm chat](./chat.md) for the full reference of all available arguments. + ## complete Generate text completions based on the given prompt via the running API server. @@ -93,7 +87,7 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm complete --quick "The future of AI is" ``` -
+See [vllm complete](./complete.md) for the full reference of all available arguments. ## bench @@ -120,6 +114,8 @@ vllm bench latency \ --load-format dummy ``` +See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments. + ### serve Benchmark the online serving throughput. @@ -134,6 +130,8 @@ vllm bench serve \ --num-prompts 5 ``` +See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments. + ### throughput Benchmark offline inference throughput. @@ -147,6 +145,8 @@ vllm bench throughput \ --load-format dummy ``` +See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments. + ## collect-env Start collecting environment information. @@ -159,24 +159,25 @@ vllm collect-env Run batch prompts and write results to file. -
-Examples +Running with a local file: ```bash -# Running with a local file vllm run-batch \ -i offline_inference/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct +``` -# Using remote file +Using remote file: + +```bash vllm run-batch \ -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` -
+See [vllm run-batch](./run-batch.md) for the full reference of all available arguments. ## More Help diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md new file mode 100644 index 0000000000..21ab13e637 --- /dev/null +++ b/docs/cli/bench/latency.md @@ -0,0 +1,9 @@ +# vllm bench latency + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_latency.md" diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md new file mode 100644 index 0000000000..f7c415c6be --- /dev/null +++ b/docs/cli/bench/serve.md @@ -0,0 +1,9 @@ +# vllm bench serve + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_serve.md" diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md new file mode 100644 index 0000000000..e4ff5ce43c --- /dev/null +++ b/docs/cli/bench/throughput.md @@ -0,0 +1,9 @@ +# vllm bench throughput + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_throughput.md" diff --git a/docs/cli/chat.md b/docs/cli/chat.md new file mode 100644 index 0000000000..b006cb8de6 --- /dev/null +++ b/docs/cli/chat.md @@ -0,0 +1,5 @@ +# vllm chat + +## Options + +--8<-- "docs/argparse/chat.md" diff --git a/docs/cli/complete.md b/docs/cli/complete.md new file mode 100644 index 0000000000..400359acf4 --- /dev/null +++ b/docs/cli/complete.md @@ -0,0 +1,5 @@ +# vllm complete + +## Options + +--8<-- "docs/argparse/complete.md" diff --git a/docs/cli/json_tip.inc.md b/docs/cli/json_tip.inc.md new file mode 100644 index 0000000000..c22430c264 --- /dev/null +++ b/docs/cli/json_tip.inc.md @@ -0,0 +1,9 @@ +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` \ No newline at end of file diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md new file mode 100644 index 0000000000..f7d401b8da --- /dev/null +++ b/docs/cli/run-batch.md @@ -0,0 +1,9 @@ +# vllm run-batch + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/run-batch.md" diff --git a/docs/cli/serve.md b/docs/cli/serve.md new file mode 100644 index 0000000000..2c8f9d320f --- /dev/null +++ b/docs/cli/serve.md @@ -0,0 +1,9 @@ +# vllm serve + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/serve.md" diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md index e7ca08b557..05d4f76230 100644 --- a/docs/configuration/engine_args.md +++ b/docs/configuration/engine_args.md @@ -11,15 +11,7 @@ Engine arguments control the behavior of the vLLM engine. The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings. 
-When passing JSON CLI arguments, the following sets of arguments are equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` +--8<-- "docs/cli/json_tip.inc.md" ## `EngineArgs` diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index b003b5fd6c..ed5d3b0092 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -15,8 +15,14 @@ sys.modules["aiohttp"] = MagicMock() sys.modules["blake3"] = MagicMock() sys.modules["vllm._C"] = MagicMock() +from vllm.benchmarks import latency # noqa: E402 +from vllm.benchmarks import serve # noqa: E402 +from vllm.benchmarks import throughput # noqa: E402 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402 +from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402 +from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402 +from vllm.entrypoints.openai import cli_args # noqa: E402 +from vllm.entrypoints.openai import run_batch # noqa: E402 from vllm.utils import FlexibleArgumentParser # noqa: E402 logger = logging.getLogger("mkdocs") @@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter): self._markdown_output.append( f"Possible choices: {metavar}\n\n") - self._markdown_output.append(f"{action.help}\n\n") + if action.help: + self._markdown_output.append(f"{action.help}\n\n") if (default := action.default) != SUPPRESS: self._markdown_output.append(f"Default: `{default}`\n\n") @@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter): return "".join(self._markdown_output) -def create_parser(cls, **kwargs) -> FlexibleArgumentParser: +def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser: """Create a parser for the given class with markdown formatting. Args: @@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser: Returns: FlexibleArgumentParser: A parser with markdown formatting for the class. 
""" - parser = FlexibleArgumentParser() + parser = FlexibleArgumentParser(add_json_tip=False) parser.formatter_class = MarkdownFormatter with patch("vllm.config.DeviceConfig.__post_init__"): - return cls.add_cli_args(parser, **kwargs) - - -def create_serve_parser() -> FlexibleArgumentParser: - """Create a parser for the serve command with markdown formatting.""" - parser = FlexibleArgumentParser() - parser.formatter_class = lambda prog: MarkdownFormatter( - prog, starting_heading_level=4) - return make_arg_parser(parser) + _parser = add_cli_args(parser, **kwargs) + # add_cli_args might be in-place so return parser if _parser is None + return _parser or parser def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): @@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Create parsers to document parsers = { - "engine_args": create_parser(EngineArgs), - "async_engine_args": create_parser(AsyncEngineArgs, - async_args_only=True), - "serve": create_serve_parser(), + "engine_args": + create_parser(EngineArgs.add_cli_args), + "async_engine_args": + create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True), + "serve": + create_parser(cli_args.make_arg_parser), + "chat": + create_parser(ChatCommand.add_cli_args), + "complete": + create_parser(CompleteCommand.add_cli_args), + "bench_latency": + create_parser(latency.add_cli_args), + "bench_throughput": + create_parser(throughput.add_cli_args), + "bench_serve": + create_parser(serve.add_cli_args), + "run-batch": + create_parser(run_batch.make_arg_parser), } # Generate documentation for each parser diff --git a/requirements/docs.txt b/requirements/docs.txt index c589093110..a24b9c7e92 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -29,3 +29,5 @@ setproctitle torch transformers zmq +uvloop +prometheus-client diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index bbd18ca3ae..fdf6548ada 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -24,8 +24,6 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput @@ -146,6 +144,8 @@ async def run_vllm_async( disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) async with build_async_engine_client_from_engine_args( engine_args, diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index e71f77ba80..7c01de94a3 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -130,28 +130,33 @@ class ChatCommand(CLISubcommand): conversation.append(response_message) # type: ignore print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - chat_parser = subparsers.add_parser( - "chat", - help="Generate chat completions via the running API server.", - description="Generate chat completions via the running API server.", - usage="vllm chat [options]") - _add_query_options(chat_parser) - chat_parser.add_argument( + @staticmethod + def add_cli_args(parser: 
FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the chat command.""" + _add_query_options(parser) + parser.add_argument( "--system-prompt", type=str, default=None, help=("The system prompt to be added to the chat template, " "used for models that support system prompts.")) - chat_parser.add_argument("-q", - "--quick", - type=str, - metavar="MESSAGE", - help=("Send a single prompt as MESSAGE " - "and print the response, then exit.")) - return chat_parser + parser.add_argument("-q", + "--quick", + type=str, + metavar="MESSAGE", + help=("Send a single prompt as MESSAGE " + "and print the response, then exit.")) + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server.", + description="Generate chat completions via the running API server.", + usage="vllm chat [options]") + return ChatCommand.add_cli_args(parser) class CompleteCommand(CLISubcommand): @@ -179,25 +184,30 @@ class CompleteCommand(CLISubcommand): output = completion.choices[0].text print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - complete_parser = subparsers.add_parser( - "complete", - help=("Generate text completions based on the given prompt " - "via the running API server."), - description=("Generate text completions based on the given prompt " - "via the running API server."), - usage="vllm complete [options]") - _add_query_options(complete_parser) - complete_parser.add_argument( + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the complete command.""" + _add_query_options(parser) + parser.add_argument( "-q", "--quick", type=str, metavar="PROMPT", help= "Send a single prompt and print the completion output, then exit.") - return complete_parser + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "complete", + help=("Generate text completions based on the given prompt " + "via the running API server."), + description=("Generate text completions based on the given prompt " + "via the running API server."), + usage="vllm complete [options]") + return CompleteCommand.add_cli_args(parser) def cmd_init() -> list[CLISubcommand]: diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a10d57456b..01551a8c7f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger # yapf: disable -from vllm.entrypoints.openai.api_server import build_async_engine_client from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, BatchResponseData, @@ -34,7 +33,6 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_score import ServingScores from vllm.logger import init_logger -from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -469,6 +467,9 @@ async def run_batch( async def main(args: Namespace): + from vllm.entrypoints.openai.api_server import build_async_engine_client + 
from vllm.usage.usage_lib import UsageContext + async with build_async_engine_client( args, usage_context=UsageContext.OPENAI_BATCH_RUNNER, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a4997226ea..095829db83 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1682,6 +1682,8 @@ class FlexibleArgumentParser(ArgumentParser): # Set the default "formatter_class" to SortedHelpFormatter if "formatter_class" not in kwargs: kwargs["formatter_class"] = SortedHelpFormatter + # Pop kwarg "add_json_tip" to control whether to add the JSON tip + self.add_json_tip = kwargs.pop("add_json_tip", True) super().__init__(*args, **kwargs) if sys.version_info < (3, 13): @@ -1726,7 +1728,8 @@ class FlexibleArgumentParser(ArgumentParser): def format_help(self) -> str: # Add tip about JSON arguments to the epilog epilog = self.epilog or "" - if not epilog.startswith(FlexibleArgumentParser._json_tip): + if (self.add_json_tip + and not epilog.startswith(FlexibleArgumentParser._json_tip)): self.epilog = FlexibleArgumentParser._json_tip + epilog return super().format_help() From ebf7605b0dd58ff5d572d1918e52ca732025eee0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 11 Aug 2025 15:15:27 +0800 Subject: [PATCH 161/932] [Misc] Move tensor schema tests (#22612) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 7 ++++--- tests/utils_/__init__.py | 6 ++++++ tests/{standalone_tests => utils_}/test_tensor_schema.py | 0 tests/{ => utils_}/test_utils.py | 3 +-- tools/check_pickle_imports.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 tests/utils_/__init__.py rename tests/{standalone_tests => utils_}/test_tensor_schema.py (100%) rename tests/{ => utils_}/test_utils.py (99%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index db7351edbb..ebcf51981e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -57,9 +57,10 @@ steps: - vllm/ - tests/mq_llm_engine - tests/async_engine - - tests/test_inputs + - tests/test_inputs.py + - tests/test_outputs.py - tests/multimodal - - tests/test_utils + - tests/utils_ - tests/worker - tests/standalone_tests/lazy_imports.py commands: @@ -70,7 +71,7 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - - pytest -v -s test_utils.py # Utils + - pytest -v -s utils_ # Utils - pytest -v -s worker # Worker - label: Python-only Installation Test diff --git a/tests/utils_/__init__.py b/tests/utils_/__init__.py new file mode 100644 index 0000000000..e6b4c3f636 --- /dev/null +++ b/tests/utils_/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module is named `utils_` instead of `utils` to avoid obscuring +`tests/utils.py`. 
+""" diff --git a/tests/standalone_tests/test_tensor_schema.py b/tests/utils_/test_tensor_schema.py similarity index 100% rename from tests/standalone_tests/test_tensor_schema.py rename to tests/utils_/test_tensor_schema.py diff --git a/tests/test_utils.py b/tests/utils_/test_utils.py similarity index 99% rename from tests/test_utils.py rename to tests/utils_/test_utils.py index 53a34642e5..a2db1ae684 100644 --- a/tests/test_utils.py +++ b/tests/utils_/test_utils.py @@ -5,7 +5,6 @@ import asyncio import hashlib import json -import logging import pickle import socket from collections.abc import AsyncIterator @@ -29,7 +28,7 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, merge_async_iterators, sha256, split_host_port, split_zmq_path, supports_kw, swap_dict_values) -from .utils import create_new_process_for_each_test, error_on_warning +from ..utils import create_new_process_for_each_test, error_on_warning @pytest.mark.asyncio diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index 5e99dc63eb..444e2bf53f 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -32,7 +32,7 @@ ALLOWED_FILES = set([ 'vllm/multimodal/hasher.py', 'vllm/transformers_utils/config.py', 'vllm/model_executor/models/registry.py', - 'tests/test_utils.py', + 'tests/utils_/test_utils.py', 'tests/tokenization/test_cached_tokenizer.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', From 951b038298cae379d1321087a296882aae61fce7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 11 Aug 2025 18:49:32 +0800 Subject: [PATCH 162/932] [Misc] Move jsontree to utils (#22622) Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 2 +- vllm/model_executor/models/aya_vision.py | 2 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minimax_vl_01.py | 2 +- vllm/model_executor/models/tarsier.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 2 +- vllm/{ => utils}/jsontree.py | 0 8 files changed, 7 insertions(+), 7 deletions(-) rename vllm/{ => utils}/jsontree.py (100%) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 6331a70b46..dc32365083 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -8,10 +8,10 @@ import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import TypeVar -from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils import get_allowed_kwarg_only_overrides +from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: from vllm.config import ModelConfig diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b476a4f918..5cd74bbba4 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,7 +16,6 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( get_optimal_tiled_canvas) from vllm.config import VllmConfig -from vllm.jsontree import json_map_leaves from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs @@ -29,6 +28,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from 
vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 708ca98995..89d2817b57 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -16,7 +16,6 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -33,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 62a7d37ec9..8107c6e8a0 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -8,7 +8,6 @@ import torch.nn as nn from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -17,6 +16,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 70cf5e95a5..c8709d866b 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -18,7 +18,6 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from vllm.config import VllmConfig from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -34,6 +33,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 262b22e554..6074a4d54f 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -7,9 +7,9 @@ from typing import TypeVar, Union import torch -from vllm.jsontree import json_map_leaves, json_reduce_leaves from vllm.logger import init_logger from vllm.utils import GiB_bytes, LRUCache +from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors diff --git 
a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 18aae35c6f..6d4bcef320 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -13,8 +13,8 @@ from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, import numpy as np from typing_extensions import NotRequired, TypeAlias -from vllm.jsontree import JSONTree, json_map_leaves from vllm.utils import LazyLoader, full_groupby, is_list_of +from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: import torch diff --git a/vllm/jsontree.py b/vllm/utils/jsontree.py similarity index 100% rename from vllm/jsontree.py rename to vllm/utils/jsontree.py From 14a5d903ab826b723a24a2d89631006394de76a1 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:09:24 +0300 Subject: [PATCH 163/932] [Model] NemotronH Support (#22349) Signed-off-by: Daniel Afrimi --- vllm/model_executor/models/nemotron_h.py | 26 +++++++++++++++---- vllm/transformers_utils/configs/nemotron_h.py | 4 +-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index eb62d5a53c..08315a1385 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -64,20 +64,32 @@ class NemotronHMLP(nn.Module): def __init__( self, config: NemotronHConfig, + layer_idx: int, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, prefix: str = "", ) -> None: super().__init__() + + hybrid_override_pattern = config.hybrid_override_pattern + mlp_index = hybrid_override_pattern[:layer_idx + 1].count("-") - 1 + if isinstance(config.intermediate_size, list): + if len(config.intermediate_size) == 1: + intermediate_size = config.intermediate_size[0] + else: + intermediate_size = config.intermediate_size[mlp_index] + else: + intermediate_size = config.intermediate_size + self.up_proj = ColumnParallelLinear( input_size=config.hidden_size, - output_size=config.intermediate_size, + output_size=intermediate_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.up_proj", ) self.down_proj = RowParallelLinear( - input_size=config.intermediate_size, + input_size=intermediate_size, output_size=config.hidden_size, bias=bias, quant_config=quant_config, @@ -110,6 +122,7 @@ class NemotronHMLPDecoderLayer(nn.Module): quant_config=quant_config, bias=config.mlp_bias, prefix=f"{prefix}.mixer", + layer_idx=layer_idx, ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -146,7 +159,7 @@ class NemotronHMambaDecoderLayer(nn.Module): hidden_size=config.hidden_size, ssm_state_size=config.ssm_state_size, conv_kernel_size=config.conv_kernel, - intermediate_size=config.expand * config.hidden_size, + intermediate_size=config.mamba_num_heads * config.mamba_head_dim, use_conv_bias=config.use_conv_bias, use_bias=config.use_bias, n_groups=config.n_groups, @@ -205,7 +218,10 @@ class NemotronHAttention(nn.Module): # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = config.hidden_size // self.total_num_heads + if hasattr(config, "head_dim") and config.head_dim is not None: + self.head_dim = config.head_dim + else: + self.head_dim = config.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -481,7 +497,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, """ parallel_config = vllm_config.parallel_config hf_config = vllm_config.model_config.hf_config - intermediate_size = hf_config.expand * hf_config.hidden_size + intermediate_size = hf_config.mamba_num_heads * hf_config.mamba_head_dim return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py index 457b3371e9..027f291154 100644 --- a/vllm/transformers_utils/configs/nemotron_h.py +++ b/vllm/transformers_utils/configs/nemotron_h.py @@ -151,7 +151,7 @@ class NemotronHConfig(PretrainedConfig): num_hidden_layers=52, hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", num_attention_heads=32, - attention_head_dim=128, + head_dim=128, num_key_value_heads=8, # nemo: num_query_groups mlp_hidden_act="relu2", attention_bias=False, @@ -194,7 +194,7 @@ class NemotronHConfig(PretrainedConfig): self.num_hidden_layers = num_hidden_layers self.hybrid_override_pattern = hybrid_override_pattern self.num_attention_heads = num_attention_heads - self.attention_head_dim = attention_head_dim + self.head_dim = head_dim self.sliding_window = sliding_window self.max_position_embeddings = max_position_embeddings self.attention_dropout = attention_dropout From 3fa5b258455772b522d0e0d764d7dad65578310a Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Mon, 11 Aug 2025 15:22:45 +0100 Subject: [PATCH 164/932] Document aarch64 CPU support works (#22646) Signed-off-by: Eric Curtin --- docs/usage/v1_guide.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index a9492c8502..12191d3490 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -59,12 +59,12 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the ### Hardware -| Hardware | Status | -|------------|------------------------------------| -| **NVIDIA** | 🚀 | -| **AMD** | 🟢 | -| **TPU** | 🟢 | -| **CPU** | 🟢 (x86) 🟡 (MacOS) | +| Hardware | Status | +|------------|-----------------------------------------------| +| **NVIDIA** | 🚀 | +| **AMD** | 🟢 | +| **TPU** | 🟢 | +| **CPU** | 🟢 (x86\_64/aarch64) 🟡 (MacOS) | !!! 
note From 8e13d9fe6d486f3bfa096e28d683601d72a5a1cc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 12 Aug 2025 00:22:25 +0800 Subject: [PATCH 165/932] [Misc] Further clean up some redundant config definitions (#22649) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/transformers_utils/config.py | 51 +++++++++++++-------- vllm/transformers_utils/configs/__init__.py | 6 +-- vllm/transformers_utils/configs/mllama.py | 31 ------------- vllm/transformers_utils/configs/nvlm_d.py | 31 ------------- 4 files changed, 34 insertions(+), 85 deletions(-) delete mode 100644 vllm/transformers_utils/configs/mllama.py delete mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b70164c8c..02ea0814dd 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -32,11 +32,10 @@ from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, EAGLEConfig, JAISConfig, KimiVLConfig, MedusaConfig, - MllamaConfig, MLPSpeculatorConfig, + MLPSpeculatorConfig, Nemotron_Nano_VL_Config, - NemotronConfig, NVLM_D_Config, - OvisConfig, RWConfig, - SpeculatorsConfig, + NemotronConfig, OvisConfig, + RWConfig, SpeculatorsConfig, Step3TextConfig, Step3VLConfig, UltravoxConfig) # yapf: enable @@ -68,10 +67,6 @@ def _get_hf_token() -> Optional[str]: return None -_CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { - "mllama": MllamaConfig -} - _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, "deepseek_vl_v2": DeepseekVLV2Config, @@ -85,18 +80,30 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "eagle": EAGLEConfig, "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, - "NVLM_D": NVLM_D_Config, "ovis": OvisConfig, "ultravox": UltravoxConfig, "step3_vl": Step3VLConfig, "step3_text": Step3TextConfig, - **_CONFIG_REGISTRY_OVERRIDE_HF } _CONFIG_ATTRS_MAPPING: dict[str, str] = { "llm_config": "text_config", } +_AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { + "internvl_chat": { + "has_no_defaults_at_init": True + }, + # transformers regards mllama as is_encoder_decoder=False + # vllm needs is_encoder_decoder=True to enable cross-attention + "mllama": { + "is_encoder_decoder": True + }, + "NVLM_D": { + "has_no_defaults_at_init": True + }, +} + class ConfigFormat(str, enum.Enum): AUTO = "auto" @@ -273,11 +280,12 @@ def thinker_uses_mrope(config: PretrainedConfig) -> bool: def is_encoder_decoder(config: PretrainedConfig) -> bool: """Detect if the model with this config is used as an encoder/decoder.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - return is_encoder_decoder(text_config) - return getattr(config, "is_encoder_decoder", False) + def _is_encoder_decoder(config: PretrainedConfig) -> bool: + return getattr(config, "is_encoder_decoder", False) + + return (_is_encoder_decoder(config) + or _is_encoder_decoder(config.get_text_config())) def is_interleaved(config: PretrainedConfig) -> bool: @@ -291,13 +299,21 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False +def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): + """ + Update kwargs for AutoConfig initialization based on model_type + """ + if model_type in _AUTO_CONFIG_KWARGS_OVERRIDES: + kwargs.update(_AUTO_CONFIG_KWARGS_OVERRIDES[model_type]) + return kwargs + + def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: 
"""Remap config attributes to match the expected names.""" for old_attr, new_attr in _CONFIG_ATTRS_MAPPING.items(): if hasattr(config, old_attr): if not hasattr(config, new_attr): config.update({new_attr: getattr(config, old_attr)}) - delattr(config, old_attr) logger.debug("Remapped config attribute '%s' to '%s'", old_attr, new_attr) return config @@ -408,15 +424,14 @@ def get_config( ) else: try: + kwargs = _maybe_update_auto_config_kwargs( + kwargs, model_type=model_type) config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, token=_get_hf_token(), - # some old custom model's config needs - # `has_no_defaults_at_init=True` to work. - has_no_defaults_at_init=trust_remote_code, **kwargs, ) except ValueError as e: diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 82d24bb16b..8339c55bcf 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -17,13 +17,11 @@ from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig -from vllm.transformers_utils.configs.mllama import MllamaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config -from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, @@ -34,18 +32,16 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", "DeepseekVLV2Config", + "EAGLEConfig", "RWConfig", "JAISConfig", "MedusaConfig", - "EAGLEConfig", - "MllamaConfig", "MLPSpeculatorConfig", "MoonViTConfig", "KimiVLConfig", "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", - "NVLM_D_Config", "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py deleted file mode 100644 index f0cd2d52a5..0000000000 --- a/vllm/transformers_utils/configs/mllama.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from transformers.models.mllama import configuration_mllama as mllama_hf_config - - -class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): - ''' - Use this class to override is_encoder_decoder: - - transformers regards mllama as is_encoder_decoder=False - - vllm needs is_encoder_decoder=True to enable cross-attention - ''' - - def __init__( - self, - **kwargs, - ): - super().__init__(**kwargs) - self.is_encoder_decoder = True - - -class MllamaConfig(mllama_hf_config.MllamaConfig): - - def __init__( - self, - text_config=None, - **kwargs, - ): - if isinstance(text_config, dict): - text_config = MllamaTextConfig(**text_config) - super().__init__(text_config=text_config, **kwargs) diff --git a/vllm/transformers_utils/configs/nvlm_d.py 
b/vllm/transformers_utils/configs/nvlm_d.py deleted file mode 100644 index edfc506882..0000000000 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py -# -------------------------------------------------------- -# NVLM-D -# Copyright (c) 2024 NVIDIA -# Licensed under Apache 2.0 License [see LICENSE for details] -# -------------------------------------------------------- -from transformers import Qwen2Config -from transformers.configuration_utils import PretrainedConfig - - -class NVLM_D_Config(PretrainedConfig): - model_type = 'NVLM_D' - is_composition = True - - def __init__(self, vision_config=None, llm_config=None, **kwargs): - super().__init__(**kwargs) - - # Handle vision_config initialization - if vision_config is None: - vision_config = {} - - # Handle llm_config initialization - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = Qwen2Config(**llm_config) From f7dcce7a4aabb1445c2827ac5d978a9c5e18be30 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:39:08 -0400 Subject: [PATCH 166/932] [Feature] Add `VLLM_USE_DEEP_GEMM_E8M0` Env to Control E8M0 Scale (#21968) Signed-off-by: yewentao256 --- tests/kernels/moe/test_block_fp8.py | 5 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 6 +-- vllm/envs.py | 5 ++ .../layers/fused_moe/batched_deep_gemm_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 6 +-- .../layers/fused_moe/triton_deep_gemm_moe.py | 6 +-- .../model_executor/layers/quantization/fp8.py | 19 +++----- .../layers/quantization/utils/fp8_utils.py | 6 +-- vllm/utils/deep_gemm.py | 47 +++++++++++++++---- 9 files changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 7dc6282326..75b2e9f791 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used dg_available = has_deep_gemm() @@ -224,7 +224,8 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE") +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), + reason="Not E8M0 scale MOE") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 266f1161a6..9b064db973 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import 
(is_blackwell_deep_gemm_used, +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -370,7 +370,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int]): @@ -427,7 +427,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], diff --git a/vllm/envs.py b/vllm/envs.py index c26c7f215d..931edcfa7f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -127,6 +127,7 @@ if TYPE_CHECKING: VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False + VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False @@ -925,6 +926,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. + # E8M0 is faster on B200 but may reduce accuracy. + "VLLM_USE_DEEP_GEMM_E8M0": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # JIT all the required kernels before model execution so there is no # JIT'ing in the hot-path. 
However, this warmup increases the engine diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 3ccddb5299..c48a0137c3 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, - is_blackwell_deep_gemm_used) + is_blackwell_deep_gemm_e8m0_used) logger = init_logger(__name__) @@ -176,7 +176,7 @@ def silu_mul_fp8_quant_deep_gemm( eps, fp8_min, fp8_max, - is_blackwell_deep_gemm_used(), + is_blackwell_deep_gemm_e8m0_used(), BLOCK=group_size, NUM_STAGES=8, num_warps=1, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 86cc6e0e5d..ad094c37f9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1387,8 +1387,8 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. - should_use_deep_gemm = is_blackwell_deep_gemm_used() or _valid_deep_gemm( - hidden_states, w1, w2) + should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used( + ) or _valid_deep_gemm(hidden_states, w1, w2) if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): assert apply_router_weight_on_input is False assert is_act_and_mul, ( diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index c67f7e8083..9d0ff2e061 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -107,7 +107,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
- if self.allow_deep_gemm and (is_blackwell_deep_gemm_used() + if self.allow_deep_gemm and (is_blackwell_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( @@ -133,7 +133,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): extra_expert_args: Optional[dict[str, Any]]): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) - or is_blackwell_deep_gemm_used())) + or is_blackwell_deep_gemm_e8m0_used())) experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert assert experts is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8b6ed154bd..9577fa025b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -45,7 +45,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, + is_deep_gemm_supported) from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: @@ -415,10 +416,10 @@ class Fp8LinearMethod(LinearMethodBase): # Activations not quantized for marlin. del layer.input_scale - # On B200, DeepGemm only support E8M0 scale, which means we need to + # On B200, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. - if is_blackwell_deep_gemm_used(): + if is_blackwell_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -505,15 +506,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): elif not self.block_quant: logger.warning_once("Model is not block quantized. Not using " "DeepGemm kernels") - elif (current_platform.is_cuda() - and current_platform.is_device_capability(90)): + elif (is_deep_gemm_supported()): logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.") self.allow_deep_gemm = True - elif (current_platform.is_cuda() - and is_blackwell_deep_gemm_used()): - logger.info_once("Using DeepGemm SM100 kernels for " - "Fp8MoEMethod.") - self.allow_deep_gemm = True else: logger.warning_once( "DeepGemm not supported on the current platform.") @@ -725,7 +720,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm and not is_blackwell_deep_gemm_used(): + if self.allow_deep_gemm and not is_blackwell_deep_gemm_e8m0_used(): # Lazy import to avoid CUDA initialization problems. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ @@ -851,7 +846,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - if is_blackwell_deep_gemm_used(): + if is_blackwell_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. 
block_sz = tuple(layer.weight_block_size) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 68a061968a..2fb7ef29e4 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op, has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used logger = init_logger(__name__) @@ -394,10 +394,8 @@ def per_token_group_quant_fp8( tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor. """ - # TODO(wentao): refactor this - # use_ue8m0 should be a global flag that could be set by user if use_ue8m0 is None: - use_ue8m0 = is_blackwell_deep_gemm_used() + use_ue8m0 = is_blackwell_deep_gemm_e8m0_used() dtype = current_platform.fp8_dtype() if dtype is None else dtype assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 174287b44b..861d9c0c00 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -31,19 +31,37 @@ def is_deep_gemm_supported() -> bool: @functools.cache -def is_blackwell_deep_gemm_used() -> bool: - """Return ``True`` if vLLM is configured to use DeepGEMM on a - Blackwell-class GPU. +def is_blackwell_deep_gemm_e8m0_used() -> bool: + """Return ``True`` if vLLM is configured to use DeepGEMM " + "E8M0 scale on a Blackwell-class GPU. """ - if not (envs.VLLM_USE_DEEP_GEMM and has_deep_gemm()): + if not (envs.VLLM_USE_DEEP_GEMM): + logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0.") + return False + + if not has_deep_gemm(): + logger.debug_once("DeepGEMM E8M0 disabled: DeepGEMM backend missing.") + return False + + if not envs.VLLM_USE_DEEP_GEMM_E8M0: + logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.") return False _lazy_init() + if _fp8_gemm_nt_impl is None: + logger.debug_once( + "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False - return (current_platform.is_cuda() - and current_platform.is_device_capability(100)) + enabled = (current_platform.is_cuda() + and current_platform.has_device_capability(100)) + if enabled: + logger.debug_once("DeepGEMM E8M0 enabled on Blackwell GPU.") + else: + logger.debug_once( + "DeepGEMM E8M0 disabled: not running on Blackwell GPU.") + return enabled def _missing(*_: Any, **__: Any) -> NoReturn: @@ -109,21 +127,30 @@ def fp8_gemm_nt(*args, **kwargs): _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) - return _fp8_gemm_nt_impl(*args, **kwargs) + return _fp8_gemm_nt_impl( + *args, + disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) - return _grouped_impl(*args, **kwargs) + return _grouped_impl( + *args, + disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): _lazy_init() if _grouped_masked_impl is None: return _missing(*args, **kwargs) - return _grouped_masked_impl(*args, **kwargs) + return _grouped_masked_impl( + *args, + disable_ue8m0_cast=not 
is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def _ceil_to_ue8m0(x: torch.Tensor): @@ -181,6 +208,6 @@ __all__ = [ "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", - "is_blackwell_deep_gemm_used", + "is_blackwell_deep_gemm_e8m0_used", "is_deep_gemm_supported", ] From 16fb668b61c8d21d1e86f0fa4aa876beb7647a8d Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Mon, 11 Aug 2025 09:40:55 -0700 Subject: [PATCH 167/932] fix: NIXL connector transfers partial block to pass full multi-modal context (#21074) Signed-off-by: GuanLuo --- .../kv_connector/unit/test_nixl_connector.py | 18 ++- .../unit/test_remote_decode_lifecycle.py | 23 ++-- .../unit/test_remote_prefill_lifecycle.py | 104 +++++++++++++++++- .../kv_connector/v1/nixl_connector.py | 26 ++--- 4 files changed, 130 insertions(+), 41 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c5ca7df836..c673983235 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -173,9 +173,9 @@ def test_prompt_less_than_block_size(): """ Test that we can handle case where prompt is < block. - In this case, the P worker will send empty remote_block_ids. - The D worker should not schedule an async read in this case, - since there is nothing to pull. + In this case, the P worker will still send remote_block_ids of the + partial block. The D worker should schedule an async read + in this case. """ vllm_config = create_vllm_config() scheduler = create_scheduler(vllm_config) @@ -184,22 +184,20 @@ def test_prompt_less_than_block_size(): BLOCK_SIZE = vllm_config.cache_config.block_size NUM_TOKENS = int(BLOCK_SIZE * 0.5) - # Request will have 0 remote blocks. + # Request will have 1 partial remote block. request = create_request(request_id=1, num_tokens=NUM_TOKENS, do_remote_prefill=True, - num_remote_blocks=0) + num_remote_blocks=1) scheduler.add_request(request) scheduler_output = scheduler.schedule() - # This request should not have to read async. + # This request will read async. kv_connector_metadata = scheduler_output.kv_connector_metadata assert kv_connector_metadata is not None assert isinstance(kv_connector_metadata, NixlConnectorMetadata) - assert len(kv_connector_metadata.reqs_to_recv) == 0 - - # This request should be scheduled regularly. - assert len(scheduler_output.scheduled_new_reqs) == 1 + assert len(kv_connector_metadata.reqs_to_recv) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 0 class FakeNixlConnectorWorker(NixlConnectorWorker): diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 76394a540a..1bddfef0f2 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -121,13 +121,18 @@ def test_short_prompt_lifecycle(): model_runner_output = create_model_runner_output(reqs=[request]) # (1c): update_from_output() - # Since tokens < block_size, there will be no kv xfer. - # So this should be cleaned up immediately. - _ = scheduler.update_from_output(scheduler_output, model_runner_output) + # Even though tokens < block_size, there will be kv xfer for partial block. 
+ eco = scheduler.update_from_output(scheduler_output, model_runner_output) + kv_transfer_params = eco[0].outputs[0].kv_transfer_params + + assert (len(kv_transfer_params["remote_block_ids"]) == 1) # Confirm we do not have any memory leaks after req lifecycle. - # We need one more call to schedule() to clear data for persistent batch. - _ = scheduler.schedule() + # We need to mark sending finish to clear data for persistent batch. + scheduler_output = scheduler.schedule() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.finished_sending = [request.request_id] + scheduler.update_from_output(scheduler_output, model_runner_output) assert_scheduler_empty(scheduler) @@ -169,16 +174,16 @@ def test_prefix_cache_lifecycle(): eco = scheduler.update_from_output(scheduler_output, model_runner_output) kv_transfer_params = eco[0].outputs[0].kv_transfer_params - # Ensure we send all block ids, even if there is a cache hit. + # Ensure we send all block ids, including the partial blocks, + # even if there is a cache hit. assert (len( - kv_transfer_params["remote_block_ids"]) == NUM_EXTERNAL_FULL_BLOCKS) + kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + + 1)) # STEP (2): Ensure it is freed. scheduler_output = scheduler.schedule() - scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) model_runner_output.kv_connector_output = KVConnectorOutput( finished_sending=[request_remote.request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) - _ = scheduler.schedule() assert_scheduler_empty(scheduler) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 3d52ea526d..87f7490698 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -362,7 +362,7 @@ def test_cannot_schedule_after_recv(): BLOCK_SIZE = vllm_config.cache_config.block_size # Prompt will use 2 blocks + 1 block after we schedule. NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) - NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, @@ -393,14 +393,24 @@ def test_cannot_schedule_after_recv(): assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 1 - # Step 4: try to schedule, not enough blocks. + # Step 4: try to schedule, remote request is put to running list + # because the transfer is completed. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[request_normal, request_remote]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 2 + assert len(scheduler.waiting) == 0 + + # Step 5: Remote request will be put back to waiting list + # because it needs new block to hold generated token. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_normal]) scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 1 - # Step 5: finish the request, free it. + # Step 6: finish the request, free it. 
scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_normal], use_eos=True) @@ -408,15 +418,99 @@ def test_cannot_schedule_after_recv(): assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 - # Step 6: now we can schedule (with 2 blocks computed). + # Step 7: now we can schedule (with 2 blocks computed), + # request is retrieved from preempted list. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote]) - assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + assert (scheduler_output.scheduled_cached_reqs.num_computed_tokens[0] == NUM_PROMPT_BLOCKS * BLOCK_SIZE) scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 0 + # Step 8: free everything. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + _ = scheduler.schedule() + assert_scheduler_empty(scheduler) + + +def test_cannot_recv(): + """ + Test that we can handle no schedule KV block transfer due to not + enough remaining KV blocks. + """ + + # NOTE: the KVCacheManager will use 1 null block. + # So there are 5 total working blocks. + TOTAL_NUM_BLOCKS = 6 + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config, num_blocks=TOTAL_NUM_BLOCKS) + + # Prime the KVCache. + NUM_PROMPT_BLOCKS = 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + # Prompt will use 2 blocks + 1 block after we schedule. + NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + + request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_remote = create_request(request_id=2, + num_tokens=NUM_TOKENS_REMOTE, + do_remote_prefill=True) + + # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode). + scheduler.add_request(request_normal) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 2: 3 blocks are in use, + # need 3 new for remote blocks but only 2 are available. + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + # Should not have KV transfer in progress. + assert (request_remote.status != RequestStatus.WAITING_FOR_REMOTE_KVS) + + # Step 3: finish the request, free it. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 4: now we can initiate KV transfer (with 2 blocks computed). 
+ scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + assert (request_remote.status == RequestStatus.WAITING_FOR_REMOTE_KVS) + + # Step 5: finish recving (5 blocks in use) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[], finished_recving=[request_remote.request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 6: schedule remote request + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + # Step 7: free everything. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote], diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index e7fc2b1181..a6eeb27853 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -29,7 +29,7 @@ from vllm.distributed.utils import divide from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform -from vllm.utils import make_zmq_path, make_zmq_socket, round_down +from vllm.utils import make_zmq_path, make_zmq_socket from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -275,10 +275,7 @@ class NixlConnectorScheduler: if params is not None and params.get("do_remote_prefill"): # Remote prefill: get all prompt blocks from remote. - assert num_computed_tokens % self.block_size == 0 - rounded_num_prompt_tokens = round_down( - len(request.prompt_token_ids), self.block_size) - count = max(rounded_num_prompt_tokens - num_computed_tokens, 0) + count = len(request.prompt_token_ids) - num_computed_tokens if count > 0: return count, True @@ -301,18 +298,16 @@ class NixlConnectorScheduler: # NOTE: when accelerator is not directly supported by Nixl, # prefilled blocks need to be saved to host memory before transfer. - # figure out full computed blocks to save + # save all blocks block_ids = blocks.get_block_ids()[0] - all_full = request.num_tokens % self.block_size == 0 - full_block_ids = (block_ids if all_full else block_ids[:-1]) # TODO: skip the blocks that are already in the host xfer buffer. # Currently, the host xfer buffer block is 1-to-1 mapped to device # kv blocks, so host blocks won't be flushed as long as its device # block is not overwritten; and it will be safe to skip saving them # to host xfer buffer. - if full_block_ids: + if block_ids: self._reqs_need_save[request.request_id] = \ - (request, full_block_ids) + (request, block_ids) elif params.get("do_remote_prefill"): if params.get("remote_block_ids"): if all(p in params for p in ("remote_engine_id", "remote_host", @@ -401,12 +396,9 @@ class NixlConnectorScheduler: or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): return False, None - # Get computed blocks. 
- all_full = request.num_computed_tokens % self.block_size == 0 - computed_block_ids = block_ids if all_full else block_ids[:-1] - - # If prompt < block_size, no xfer so free blocks immediately. - delay_free_blocks = len(computed_block_ids) > 0 + # TODO: check whether block_ids actually ever be 0. If not we could + # remove the conditional below + delay_free_blocks = len(block_ids) > 0 if delay_free_blocks: # Prefill request on remote. It will be read from D upon completion @@ -416,7 +408,7 @@ class NixlConnectorScheduler: return delay_free_blocks, dict( do_remote_prefill=True, do_remote_decode=False, - remote_block_ids=computed_block_ids, + remote_block_ids=block_ids, remote_engine_id=self.engine_id, remote_host=self.side_channel_host, remote_port=self.side_channel_port, From 84cf78acee1e75bfa163863b3674aeb3ba266844 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 00:41:37 +0800 Subject: [PATCH 168/932] [Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930) Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 6 + .../entrypoints/openai/test_classification.py | 15 +++ tests/models/language/pooling/mteb_utils.py | 12 +- .../pooling/test_auto_prefix_cache_support.py | 93 ++++++++++++++ tests/models/language/pooling/test_baai.py | 117 +++++++++--------- .../pooling/test_bge_reranker_v2_gemma.py | 8 +- .../language/pooling/test_cross_encoder.py | 12 +- tests/models/language/pooling/test_gte.py | 87 ++++++------- .../models/language/pooling/test_intfloat.py | 44 +++---- tests/models/language/pooling/test_jina.py | 14 ++- .../language/pooling/test_mxbai_rerank.py | 15 +-- tests/models/language/pooling/test_nomic.py | 27 ++-- .../language/pooling/test_qwen3_reranker.py | 15 +-- .../pooling/test_snowflake_arctic_embed.py | 67 +++++----- tests/models/utils.py | 18 +++ tests/test_config.py | 14 +++ vllm/config/__init__.py | 8 ++ vllm/engine/arg_utils.py | 9 +- vllm/entrypoints/llm.py | 4 + vllm/model_executor/layers/pooler.py | 38 ++---- vllm/model_executor/models/adapters.py | 4 +- vllm/model_executor/models/bert.py | 16 +-- vllm/model_executor/models/bert_with_rope.py | 4 +- vllm/model_executor/models/interfaces.py | 14 +++ vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/jamba.py | 4 +- vllm/model_executor/models/modernbert.py | 6 +- vllm/model_executor/models/qwen2_rm.py | 16 +-- vllm/model_executor/models/registry.py | 6 +- vllm/model_executor/models/roberta.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 13 +- 31 files changed, 452 insertions(+), 261 deletions(-) create mode 100644 tests/models/language/pooling/test_auto_prefix_cache_support.py diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index abdce8935e..71e76abcb7 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -65,3 +65,9 @@ def test_pooling_params(llm: LLM): assert torch.allclose( softmax(wo_activation), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." 
+ + +def test_encode_api(llm: LLM): + err_msg = "pooling_task must be one of.+" + with pytest.raises(ValueError, match=err_msg): + llm.encode(prompts, use_tqdm=False) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 886267c211..30078fe902 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -211,3 +211,18 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): assert torch.allclose( F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_pooling(server: RemoteOpenAIServer, model_name: str): + # pooling api uses ALL pooling, which does not support chunked prefill. + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float" + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 77aaddb4f5..d024c76ddd 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -177,9 +177,12 @@ def mteb_test_embed_models(hf_runner, max_model_len=None, **vllm_extra_kwargs) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + if model_info.architecture: - assert (model_info.architecture - in vllm_model.llm.llm_engine.model_config.architectures) + assert model_info.architecture in model_config.architectures + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) @@ -286,7 +289,12 @@ def mteb_test_rerank_models(hf_runner, **vllm_extra_kwargs) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config + + if model_info.architecture: + assert (model_info.architecture in model_config.architectures) assert model_config.hf_config.num_labels == 1 + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model), tasks=MTEB_RERANK_TASKS, diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py new file mode 100644 index 0000000000..15e24c59d1 --- /dev/null +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModelForSequenceClassification + +from tests.models.language.pooling.embed_utils import ( + run_embedding_correctness_test) + + +@pytest.mark.parametrize( + "model", + ["jason9693/Qwen2.5-1.5B-apeach"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_classify_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + example_prompts = example_prompts * 2 + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.classify(example_prompts) + + with hf_runner(model, + dtype=dtype, + 
auto_cls=AutoModelForSequenceClassification) as hf_model: + hf_outputs = hf_model.classify(example_prompts) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output) + vllm_output = torch.tensor(vllm_output) + + assert torch.allclose(hf_output, vllm_output, + 1e-3 if dtype == "float" else 1e-2) + + +@pytest.mark.parametrize( + "model", + ["Qwen/Qwen3-Embedding-0.6B"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_embed_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +): + example_prompts = [str(s).strip() for s in example_prompts] * 2 + + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + enable_prefix_caching=True, + ) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.embed(example_prompts) + + with hf_runner( + model, + is_sentence_transformer=True, + ) as hf_model: + run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/e5-small", + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False + "papluca/xlm-roberta-base-language-detection", + ]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str) -> None: + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert not cache_config.enable_prefix_caching diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 64a8f25220..6fbe0e82d7 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -2,73 +2,78 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, LASTPoolingEmbedModelInfo, + RerankModelInfo) from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel - EmbedModelInfo("BAAI/bge-base-en", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("BAAI/bge-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-noinstruct", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-v1.5", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en", + architecture="BertModel", + 
enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5", + architecture="BertModel", + enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("BAAI/bge-m3", - architecture="XLMRobertaModel", - enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-m3", + architecture="XLMRobertaModel", + enable_test=True), ########## Qwen2Model - EmbedModelInfo("BAAI/bge-code-v1", - architecture="Qwen2Model", - dtype="float32", - enable_test=True), + LASTPoolingEmbedModelInfo("BAAI/bge-code-v1", + architecture="Qwen2Model", + dtype="float32", + enable_test=True), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - RerankModelInfo("BAAI/bge-reranker-base", - architecture="XLMRobertaForSequenceClassification", - enable_test=True), - RerankModelInfo("BAAI/bge-reranker-large", - architecture="XLMRobertaForSequenceClassification", - enable_test=False), - RerankModelInfo("BAAI/bge-reranker-v2-m3", - architecture="XLMRobertaForSequenceClassification", - enable_test=False) + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-base", + architecture="XLMRobertaForSequenceClassification", + enable_test=True), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-large", + architecture="XLMRobertaForSequenceClassification", + enable_test=False), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-v2-m3", + architecture="XLMRobertaForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index 7fa9485dbc..206524d7ca 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -8,12 +8,12 @@ import torch from tests.conftest import HfRunner -from .mteb_utils import (RerankModelInfo, VllmMtebEncoder, - mteb_test_rerank_models) +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("BAAI/bge-reranker-v2-gemma", - architecture="GemmaForSequenceClassification"), + LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", + architecture="GemmaForSequenceClassification"), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." 
# noqa: E501 diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py index 9a33063d7b..8c1bc5779b 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo, + RerankModelInfo) +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", - architecture="BertForSequenceClassification"), - RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", - architecture="Qwen3ForSequenceClassification") + CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", + architecture="BertForSequenceClassification"), + LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + architecture="Qwen3ForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 48a0cd64fe..5a5fdfbb21 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,57 +4,58 @@ from typing import Any import pytest -from ...utils import check_transformers_version -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, check_transformers_version) +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("thenlper/gte-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small-zh", + architecture="BertModel", + enable_test=False), ########### NewModel - EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", + architecture="GteNewModel", + enable_test=True), ########### 
Qwen2ForCausalLM - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True), ########## ModernBertModel - EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True), ########## Qwen3ForCausalLM - EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - EmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index d899aaada2..e48bdbe940 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -2,34 +2,34 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("intfloat/e5-small", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("intfloat/e5-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/e5-large", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-small", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-small", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/e5-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-large", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small", + architecture="BertModel", + enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("intfloat/multilingual-e5-base", - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("intfloat/multilingual-e5-large", - architecture="XLMRobertaModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-large-instruct", - architecture="XLMRobertaModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 59b634428c..37c5bdc97d 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -6,20 +6,22 @@ import pytest from vllm import PoolingParams -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, 
RerankModelInfo) from .embed_utils import (check_embeddings_close, correctness_test_embed_models, matryoshka_fy) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ - EmbedModelInfo("jinaai/jina-embeddings-v3", - architecture="XLMRobertaModel", - is_matryoshka=True) + CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3", + architecture="XLMRobertaModel", + is_matryoshka=True) ] RERANK_MODELS = [ - RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual", - architecture="XLMRobertaForSequenceClassification") + CLSPoolingRerankModelInfo( + "jinaai/jina-reranker-v2-base-multilingual", + architecture="XLMRobertaForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index e74c58744d..480bd5e456 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -7,15 +7,16 @@ import torch from tests.conftest import HfRunner -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=True), - RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=False) + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index e16ec239a3..2d05958e9b 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -3,22 +3,23 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("nomic-ai/nomic-embed-text-v1", - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/CodeRankEmbed", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", - architecture="NomicBertModel", - enable_test=True) + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1", + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", + architecture="NomicBertModel", + enable_test=True) ] diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 68e96f3270..37f5566a33 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -8,15 +8,16 @@ import torch from tests.conftest import HfRunner from tests.utils import multi_gpu_test -from .mteb_utils import 
RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", - architecture="Qwen3ForSequenceClassification", - enable_test=True), - RerankModelInfo("Qwen/Qwen3-Reranker-4B", - architecture="Qwen3ForSequenceClassification", - enable_test=False) + LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B", + architecture="Qwen3ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B", + architecture="Qwen3ForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index d6b5dbd083..585fa0e683 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -3,42 +3,43 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", - is_matryoshka=False, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", - is_matryoshka=False, - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", - is_matryoshka=True, - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", - is_matryoshka=True, - architecture="GteModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", + is_matryoshka=False, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", + is_matryoshka=False, + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", + is_matryoshka=True, + architecture="XLMRobertaModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", + is_matryoshka=True, + architecture="GteModel", + enable_test=True), ] diff --git a/tests/models/utils.py b/tests/models/utils.py index 11ddf45c8e..84aeb927c5 100644 --- a/tests/models/utils.py +++ 
b/tests/models/utils.py @@ -345,16 +345,34 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "LAST" + + class RerankModelInfo(NamedTuple): name: str architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "LAST" + + def dummy_hf_overrides( hf_config: PretrainedConfig, *, diff --git a/tests/test_config.py b/tests/test_config.py index 19b1b74e42..957771a422 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -227,6 +227,20 @@ def test_get_pooling_config_from_args(): assert asdict(pooling_config) == asdict(override_pooler_config) +@pytest.mark.parametrize( + ("model_id", "default_pooling_type", "pooling_type"), + [ + ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"), # LLM + ("intfloat/e5-small", "CLS", "MEAN"), # BertModel + ("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"), # reward + ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP") # step reward + ]) +def test_default_pooling_type(model_id, default_pooling_type, pooling_type): + model_config = ModelConfig(model_id) + assert model_config._model_info.default_pooling_type == default_pooling_type + assert model_config.pooler_config.pooling_type == pooling_type + + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_bert_tokenization_sentence_transformer_config(): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 700d29f956..03ab034c62 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -871,6 +871,10 @@ class ModelConfig: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) + default_pooling_type = self._model_info.default_pooling_type + if pooler_config.pooling_type is None: + pooler_config.pooling_type = default_pooling_type + return pooler_config return None @@ -3844,6 +3848,10 @@ class VllmConfig: disable_chunked_prefill_reasons.append( "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") + elif not getattr(self.model_config.hf_config, "is_causal", True): + disable_chunked_prefill_reasons.append( + "Only models using causal attention supports chunked " + "prefill and prefix caching; disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4767201617..41a6da709b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1600,11 +1600,10 @@ class EngineArgs: else: pooling_type = model_config.pooler_config.pooling_type - - # TODO: when encoder models are supported we'll have to - # check for causal attention here. 
- incremental_prefill_supported = (pooling_type is not None and - pooling_type.lower() == "last") + is_causal = getattr(model_config.hf_config, "is_causal", True) + incremental_prefill_supported = (pooling_type is not None + and pooling_type.lower() == "last" + and is_causal) action = "Enabling" if \ incremental_prefill_supported else "Disabling" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4014a961c6..915f14a29b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1100,6 +1100,10 @@ class LLM: "Try passing `--runner pooling` to use the model as a " "pooling model.") + if pooling_task not in self.supported_tasks: + raise ValueError( + f"pooling_task must be one of {self.supported_tasks}.") + if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( prompts=cast(Optional[Union[str, list[str]]], prompts), diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 0f2e58eb9b..e2162e5cbf 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -44,15 +44,14 @@ class ResolvedPoolingConfig: task: PoolingTask @classmethod - def from_config_with_defaults( + def from_config( cls, task: PoolingTask, pooler_config: PoolerConfig, - pooling_type: PoolingType, ) -> "ResolvedPoolingConfig": + assert pooler_config.pooling_type is not None return cls(task=task, - pooling_type=PoolingType[pooler_config.pooling_type] - if pooler_config.pooling_type is not None else pooling_type) + pooling_type=PoolingType[pooler_config.pooling_type]) @dataclass(frozen=True) @@ -68,32 +67,20 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def for_encode( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.ALL, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( - task="encode", - pooler_config=pooler_config, - pooling_type=default_pooling_type, - ) - - if resolved_config.pooling_type == PoolingType.STEP: + def for_encode(pooler_config: PoolerConfig): + if pooler_config.pooling_type == "STEP": return StepPooler() + resolved_config = ResolvedPoolingConfig(task="encode", + pooling_type=PoolingType.ALL) + return SimplePooler.from_config(resolved_config) @staticmethod - def for_embed( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.LAST, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + def for_embed(pooler_config: PoolerConfig): + resolved_config = ResolvedPoolingConfig.from_config( task="embed", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) return SimplePooler.from_config(resolved_config) @@ -102,13 +89,10 @@ class Pooler(nn.Module, ABC): def for_classify( pooler_config: PoolerConfig, classifier: Optional[ClassifierFn], - *, - default_pooling_type: PoolingType = PoolingType.LAST, ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + resolved_config = ResolvedPoolingConfig.from_config( task="classify", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 867de2c68b..1dbe70f84a 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -182,8 +182,8 @@ def as_seq_cls_model(cls: _T) -> _T: assert pooler_config is not None pooling_type_str = 
pooler_config.pooling_type - pooling_type = (PoolingType.LAST if pooling_type_str is None else - PoolingType[pooling_type_str]) + assert pooling_type_str is not None + pooling_type = PoolingType[pooling_type_str] self.pooler = DispatchPooler({ "encode": diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 3d5d5d505b..6638f06f98 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces import (SupportsCrossEncoding, SupportsQuant, + default_pooling_type) from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -327,6 +328,7 @@ class BertOutput(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class BertModel(nn.Module, SupportsQuant): is_pooling_model = True @@ -401,6 +403,7 @@ class BertModel(nn.Module, SupportsQuant): return loaded_params +@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -431,6 +434,7 @@ class BertPoolingModel(BertModel): return loaded_params +@default_pooling_type("CLS") class BertEmbeddingModel(nn.Module, SupportsQuant): """A model that uses Bert to provide embedding functionalities. @@ -486,13 +490,8 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler({ - "encode": - Pooler.for_encode(pooler_config), - "embed": - Pooler.for_embed( - pooler_config, - default_pooling_type=PoolingType.CLS, - ), + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), }) @@ -541,6 +540,7 @@ def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: return token_type_ids +@default_pooling_type("CLS") class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. 
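The pooling changes in this patch hinge on a small class decorator: each encoder-style model declares its default pooling type, the registry records it in _ModelInfo, and ModelConfig falls back to it when no pooling type is configured. A minimal standalone sketch of that pattern follows; the decorator and getter mirror the interfaces.py hunk later in this patch, while _ToyBertModel, _ToyCausalLM, and the __main__ check are hypothetical, added only for illustration.

# Standalone sketch of the default-pooling-type mechanism in this patch.
# Only default_pooling_type/get_default_pooling_type mirror the diff;
# the toy classes below are hypothetical stand-ins.
from typing import Union


def default_pooling_type(pooling_type: str) -> object:
    """Class decorator that stamps a default pooling type onto a model."""

    def func(model: object):
        model.default_pooling_type = pooling_type
        return model

    return func


def get_default_pooling_type(model: Union[type[object], object]) -> str:
    # Models that never opt in fall back to "LAST".
    return getattr(model, "default_pooling_type", "LAST")


@default_pooling_type("CLS")
class _ToyBertModel:  # hypothetical stand-in for an encoder model
    pass


class _ToyCausalLM:  # hypothetical stand-in for a decoder-only model
    pass


if __name__ == "__main__":
    assert get_default_pooling_type(_ToyBertModel) == "CLS"
    assert get_default_pooling_type(_ToyCausalLM) == "LAST"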
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 050f18f16e..e18b7b7ffa 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,7 +27,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsQuant +from vllm.model_executor.models.interfaces import (SupportsQuant, + default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -401,6 +402,7 @@ class BertWithRopeEncoder(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b6d9877cd0..46caf3fce4 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -641,6 +641,20 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) +def default_pooling_type(pooling_type: str) -> object: + """Set default_pooling_type decorator. """ + + def func(model: object): + model.default_pooling_type = pooling_type + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") + + class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d29779a35e..d0c4bf5450 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -401,6 +401,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): return loaded_params +@default_pooling_type("ALL") class InternLM2ForRewardModel(InternLM2ForCausalLM): is_pooling_model = True diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index c1033aff07..fbd310121a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -22,8 +22,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateShapeCalculator) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -604,6 +603,5 @@ class 
JambaForSequenceClassification(JambaForCausalLM): Pooler.for_classify( pooler_config, classifier=self.score, - default_pooling_type=PoolingType.LAST, ), }) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 761fce815e..2c3bdd1c93 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,7 +26,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import (SupportsCrossEncoding, SupportsV0Only, + default_pooling_type) from .utils import WeightsMapper, maybe_prefix @@ -201,6 +202,7 @@ class ModernBertEncoderLayer(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -264,7 +266,6 @@ class ModernBertPooler(Pooler): self.pooling = PoolingMethod.from_pooling_type(pooling_type) self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias) - self.pooling_type = config.classifier_pooling self.act = nn.GELU() self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, @@ -294,6 +295,7 @@ class ModernBertPooler(Pooler): return pooled_output +@default_pooling_type("CLS") class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding): diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 9b6b70c75c..e0a30e04c6 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -15,11 +15,10 @@ from torch import nn from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix @@ -90,6 +89,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): return loader.load_weights(weights) +@default_pooling_type("ALL") class Qwen2ForRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -103,6 +103,7 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel): {"encode": Pooler.for_encode(pooler_config)}, ) +@default_pooling_type("STEP") class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -112,10 +113,5 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.pooler = DispatchPooler({ - "encode": - Pooler.for_encode( - pooler_config, - default_pooling_type=PoolingType.STEP, - ) - }) + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index aca3d84f00..1b0c902c5e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,8 +25,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( 
try_get_class_from_dynamic_module) -from .interfaces import (has_inner_state, has_noops, is_attention_free, - is_hybrid, supports_cross_encoding, +from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, + is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import is_pooling_model, is_text_generation_model @@ -305,6 +305,7 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool + default_pooling_type: str supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool @@ -323,6 +324,7 @@ class _ModelInfo: architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), + default_pooling_type=get_default_pooling_type(model), supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 005b917982..32a4a2c9a2 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -23,7 +23,7 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding +from .interfaces import SupportsCrossEncoding, default_pooling_type class RobertaEmbedding(nn.Module): @@ -86,6 +86,7 @@ class RobertaClassificationHead(nn.Module): return x +@default_pooling_type("CLS") class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -149,6 +150,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): return loader.load_weights(weights_list, mapper=mapper) +@default_pooling_type("CLS") class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3cde7c6e96..045a06d927 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1272,7 +1272,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if not is_pooling_model(model): return [] - return list(model.pooler.get_supported_tasks()) + supported_tasks = list(model.pooler.get_supported_tasks()) + + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in supported_tasks): + supported_tasks.remove("encode") + + logger.info_once("Chunked prefill is not supported with " + "encode task which using ALL pooling. 
" + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + + return supported_tasks def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() From c90fb03df566cd76b0e69f91158108909da80c51 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 12 Aug 2025 01:00:58 +0800 Subject: [PATCH 169/932] [CI/Build] Skip Mllama HF runner tests with Transformers v4.55.0 (#22659) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../models/multimodal/generation/test_mllama.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 2bb01e494d..b413c4d6b3 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -6,6 +6,7 @@ from typing import Optional, overload import pytest import torch from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer +from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM, SamplingParams from vllm.attention.backends.flash_attn import FlashAttentionMetadata @@ -285,6 +286,10 @@ def clear_cache(): @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, num_logprobs, @@ -313,6 +318,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, model, dtype, max_tokens, num_logprobs, attn_backend: _Backend) -> None: @@ -362,6 +371,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, dtype, max_tokens, num_logprobs, attn_backend: _Backend) -> None: @@ -402,6 +415,10 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_distributed( hf_runner, vllm_runner, From 807d21b80d11437f10dc3360ad8215f3ca6eb2e8 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:31:36 -0700 Subject: 
[PATCH 170/932] [BugFix] [Spec Decode] Remove LlamaForCausalLMEagle3 to fix CI (#22611) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/models/registry.py | 9 ++--- tests/v1/e2e/test_spec_decode.py | 45 +++++++++++++----------- vllm/model_executor/models/registry.py | 3 +- vllm/transformers_utils/configs/eagle.py | 2 +- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 898e38a4ae..c5816df25b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -525,10 +525,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), - "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 - trust_remote_code=True, - speculative_model="AngelSlim/Qwen3-8B_eagle3", - tokenizer="Qwen/Qwen3-8B"), + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + # trust_remote_code=True, + # speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + # tokenizer="Qwen/Qwen3-8B"), "EagleLlama4ForCausalLM": _HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index cd383b58db..599916c0d1 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -125,27 +125,30 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), - (("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), -], - ids=[ - "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", - "llama4_eagle", "llama4_eagle_mm" - ]) +@pytest.mark.parametrize( + ["model_setup", "mm_enabled"], + [ + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + ], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle", + "llama4_eagle_mm" + ]) @pytest.mark.parametrize("attn_backend", 
get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1b0c902c5e..870704c64d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -259,7 +259,8 @@ _SPECULATIVE_DECODING_MODELS = { "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), - "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 01217eb191..bc249c5836 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -45,7 +45,7 @@ class EAGLEConfig(PretrainedConfig): # Eagle model name should follow naming convention of # LlamaForCausalLM -> EagleLlamaForCausalLM - # LlamaForCausalLM -> Eagle3LlamaForCausalLM / LlamaForCausalLMEagle3 + # LlamaForCausalLM -> Eagle3LlamaForCausalLM if method == "eagle": assert self.model is not None, \ "model should not be None when method is eagle" From 65abe111a3035d3bf70dce217ba4e1889aa20dc3 Mon Sep 17 00:00:00 2001 From: TJian Date: Mon, 11 Aug 2025 10:36:05 -0700 Subject: [PATCH 171/932] [CI] Skip Tree Attn Test in `test_max_len.py` to unblock CI (#22664) Signed-off-by: tjtanaa --- tests/v1/spec_decode/test_max_len.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index fef6a5421b..01019b29e0 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -40,6 +40,11 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + if attn_backend == "TREE_ATTN" and num_speculative_tokens > 1: + # TREE_ATTN fails the test with multi-token spec decode + # TODO: Investigate why + pytest.skip("TREE_ATTN fails the test") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" From 458e74eb907f96069e6d8a4f3c9f457001fef2ea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 11 Aug 2025 18:42:48 +0100 Subject: [PATCH 172/932] Support more parallel styles in Transformers backend TP (#22651) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fc4585618b..25b8b69e08 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -107,10 +107,17 @@ def replace_linear_class( raise ValueError( f"Unsupported parallel style type {type(style)}, expected str") - vllm_linear_cls = { - "colwise": ColumnParallelLinear, - "rowwise": RowParallelLinear, - }.get(style, ReplicatedLinear) + vllm_linear_cls, vllm_linear_kwargs = { + "colwise": (ColumnParallelLinear, {}), + "colwise_rep": 
(ColumnParallelLinear, { + "gather_output": True + }), + "rowwise": (RowParallelLinear, {}), + "rowwise_rep": (RowParallelLinear, { + "input_is_parallel": False + }), + "replicate": (ReplicatedLinear, {}), + }.get(style, (ReplicatedLinear, {})) return vllm_linear_cls( input_size=linear.in_features, @@ -118,6 +125,7 @@ def replace_linear_class( bias=linear.bias is not None, quant_config=quant_config, return_bias=False, + **vllm_linear_kwargs, ) @@ -506,7 +514,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Some weight loaders expect linear layers to inherit from vLLM's # LinearBase class, so we set a default style which causes any # unspecified linear layers to be replaced with ReplicatedLinear - tp_plan[".*"] = "replicated" + tp_plan[".*"] = "replicate" def _tensor_parallel(module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): From 95a935fc48563ec63de02a65d41fd2d7cb1d9ea5 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 17:46:59 -0700 Subject: [PATCH 173/932] [gpt-oss] Support streaming in response API (#22431) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/serving_responses.py | 450 ++++++++++++++++++- 1 file changed, 445 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 1e3746e956..089f50a1e6 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import json import time from collections.abc import AsyncGenerator, AsyncIterator from contextlib import AsyncExitStack @@ -10,10 +11,22 @@ from http import HTTPStatus from typing import Any, Callable, Final, Optional, Union import jinja2 +import openai.types.responses as openai_responses_types from fastapi import Request -from openai.types.responses import (ResponseFunctionToolCall, - ResponseOutputItem, ResponseOutputMessage, - ResponseOutputText, ResponseReasoningItem) +from openai import BaseModel +# yapf conflicts with isort for this block +# yapf: disable +from openai.types.responses import (ResponseContentPartDoneEvent, + ResponseCreatedEvent, + ResponseFunctionToolCall, + ResponseInProgressEvent, + ResponseOutputItem, + ResponseOutputItemDoneEvent, + ResponseOutputMessage, ResponseOutputText, + ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent) +# yapf: enable from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent) from openai_harmony import Message as OpenAIHarmonyMessage @@ -330,8 +343,15 @@ class OpenAIServingResponses(OpenAIServing): return response if request.stream: - raise NotImplementedError( - "Streaming responses are not supported") + return self.responses_stream_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) try: return await self.responses_full_generator( @@ -744,3 +764,423 @@ class OpenAIServingResponses(OpenAIServing): "starting the vLLM server."), status_code=HTTPStatus.BAD_REQUEST, ) + + async def responses_stream_generator( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[Optional[ConversationContext]], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: Optional[int] = None, + ) 
-> AsyncGenerator[str, None]: + # TODO: + # 1. Handle disconnect + + if not isinstance(context, StreamingHarmonyContext): + raise NotImplementedError( + "Streaming is not supported for responses API without Harmony." + ) + + created_time = created_time or int(time.time()) + + sequence_number = 0 + + def _send_event(event: BaseModel): + nonlocal sequence_number + # Set sequence_number if the event has this attribute + if hasattr(event, 'sequence_number'): + event.sequence_number = sequence_number + sequence_number += 1 + # Get event type from the event's type field if it exists + event_type = getattr(event, 'type', 'unknown') + return (f"event: {event_type}\n" + f"data: {event.model_dump_json(indent=None)}\n\n") + + current_content_index = 0 # FIXME: this number is never changed + current_output_index = 0 + current_item_id = "" # FIXME: this number is never changed + sent_output_item_added = False + + initial_response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="in_progress", + usage=None, + ).model_dump() + yield _send_event( + ResponseCreatedEvent( + type="response.created", + sequence_number=-1, + response=initial_response, + )) + yield _send_event( + ResponseInProgressEvent( + type="response.in_progress", + sequence_number=-1, + response=initial_response, + )) + + async for ctx in result_generator: + + assert isinstance(ctx, StreamingHarmonyContext) + + if ctx.is_expecting_start(): + current_output_index += 1 + sent_output_item_added = False + + if len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if previous_item.recipient is not None: + # Deal with tool call here + pass + elif previous_item.channel == "analysis": + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=previous_item.content[0].text), + ], + status="completed", + ) + yield _send_event( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + )) + yield _send_event( + ResponseContentPartDoneEvent( + type="response.content_part.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + part=reasoning_item, + )) + yield _send_event( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + )) + elif previous_item.channel == "final": + text_content = ResponseOutputText( + type="output_text", + text=previous_item.content[0].text, + annotations=[], + ) + yield _send_event( + openai_responses_types.ResponseTextDoneEvent( + type="response.output_text.done", + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + logprobs=[], + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. 
+ ResponseContentPartDoneEvent( + type="response.content_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=text_content, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[text_content], + status="completed", + ), + )) + + if ctx.parser.last_content_delta: + if (ctx.parser.current_channel == "final" + and ctx.parser.current_recipient is None): + if not sent_output_item_added: + sent_output_item_added = True + yield _send_event( + openai_responses_types. + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + yield _send_event( + openai_responses_types.ResponseTextDeltaEvent( + type="response.output_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=ctx.parser.last_content_delta, + # TODO, use logprobs from ctx.last_request_output + logprobs=[], + )) + elif (ctx.parser.current_channel == "analysis" + and ctx.parser.current_recipient is None): + if not sent_output_item_added: + sent_output_item_added = True + yield _send_event( + openai_responses_types. + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseReasoningItem( + type="reasoning", + id=current_item_id, + summary=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + yield _send_event( + ResponseReasoningTextDeltaEvent( + type="response.reasoning_text.delta", + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + delta=ctx.parser.last_content_delta, + sequence_number=-1, + )) + + if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if (self.tool_server is not None + and self.tool_server.has_tool("browser") + and previous_item.recipient is not None + and previous_item.recipient.startswith("browser.")): + function_name = previous_item.recipient[len("browser."):] + action = None + parsed_args = json.loads(previous_item.content[0].text) + if function_name == "search": + action = (openai_responses_types. + response_function_web_search.ActionSearch( + type="search", + query=parsed_args["query"], + )) + elif function_name == "open": + action = ( + openai_responses_types. 
+ response_function_web_search.ActionOpenPage( + type="open_page", + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + )) + elif function_name == "find": + action = ( + openai_responses_types. + response_function_web_search.ActionFind( + type="find", + pattern=parsed_args["pattern"], + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + )) + else: + raise ValueError( + f"Unknown function name: {function_name}") + + yield _send_event( + openai_responses_types.ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + response_function_web_search. + ResponseFunctionWebSearch( + # TODO: generate a unique id for web search call + type="web_search_call", + id=current_item_id, + action=action, + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseWebSearchCallInProgressEvent( + type="response.web_search_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. + ResponseWebSearchCallSearchingEvent( + type="response.web_search_call.searching", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + + # enqueue + yield _send_event( + openai_responses_types. + ResponseWebSearchCallCompletedEvent( + type="response.web_search_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseFunctionWebSearch( + type="web_search_call", + id=current_item_id, + action=action, + status="completed", + ), + )) + + if (self.tool_server is not None + and self.tool_server.has_tool("python") + and previous_item.recipient is not None + and previous_item.recipient.startswith("python")): + yield _send_event( + openai_responses_types.ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code="", + container_id="auto", + outputs=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallInProgressEvent( + type="response.code_interpreter_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + # TODO: do we need to add delta event here? + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallCodeDoneEvent( + type="response.code_interpreter_call_code.done", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + code=previous_item.content[0].text)) + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallInterpretingEvent( + type="response.code_interpreter_call.interpreting", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. 
+ ResponseCodeInterpreterCallCompletedEvent( + type="response.code_interpreter_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code=previous_item.content[0].text, + container_id="auto", + # TODO: add outputs here + outputs=[], + status="completed", + ), + )) + + async def empty_async_generator(): + # A hack to trick Python to think this is a generator but in fact + # it immediately returns. + if False: + yield + + final_response = await self.responses_full_generator( + request, + sampling_params, + empty_async_generator(), + context, + model_name, + tokenizer, + request_metadata, + created_time=created_time, + ) + yield _send_event( + openai_responses_types.ResponseCompletedEvent( + type="response.completed", + sequence_number=-1, + response=final_response.model_dump(), + )) From 1891a265d316217f9c1e552cf7c380ef5bf1eec1 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 17:47:24 -0700 Subject: [PATCH 174/932] [gpt-oss] Add test for response API + harmony (but skipped) (#22554) Signed-off-by: Chen Zhang --- .../openai/test_response_api_with_harmony.py | 624 ++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 tests/entrypoints/openai/test_response_api_with_harmony.py diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py new file mode 100644 index 0000000000..1ca52599c5 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -0,0 +1,624 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import time + +import pytest +import pytest_asyncio +import requests +from openai import BadRequestError, NotFoundError, OpenAI + +from ...utils import RemoteOpenAIServer + +pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.") + +MODEL_NAME = "openai/gpt-oss-20b" +DTYPE = "bfloat16" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--tool-server", "demo"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response is not None + print("response: ", response) + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic_with_instructions(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + instructions="Respond in Korean.", + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is the capital 
of South Korea?", + reasoning={"effort": "low"}, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "system", + "content": "Respond in Korean." + }, + { + "role": "user", + "content": "Hello!" + }, + { + "role": "assistant", + "content": "Hello! How can I help you today?" + }, + { + "role": "user", + "content": "What is 13 * 24? Explain your answer." + }, + ], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat_with_input_type(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "user", + "content": [{ + "type": "input_text", + "text": "What is 13*24?" + }], + }, + ], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_structured_output(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "system", + "content": "Extract the event information." + }, + { + "role": "user", + "content": + "Alice and Bob are going to a science fair on Friday.", + }, + ], + text={ + "format": { + "type": "json_schema", + "name": "calendar_event", + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "date": { + "type": "string" + }, + "participants": { + "type": "array", + "items": { + "type": "string" + } + }, + }, + "required": ["name", "date", "participants"], + "additionalProperties": False, + }, + "description": "A calendar event.", + "strict": True, + } + }, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_structured_output_with_parse(client: OpenAI, model_name: str): + from pydantic import BaseModel + + class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + + response = await client.responses.parse( + model=model_name, + input="Alice and Bob are going to a science fair on Friday", + instructions="Extract the event information", + text_format=CalendarEvent, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_store(client: OpenAI, model_name: str): + for store in [True, False]: + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + store=store, + ) + assert response is not None + + try: + _retrieved_response = await client.responses.retrieve(response.id) + is_not_found = False + except NotFoundError: + is_not_found = True + + assert is_not_found == (not store) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_background(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + background=True, + ) + assert response is not None + + retries = 0 + max_retries = 30 + while retries < max_retries: + response = await client.responses.retrieve(response.id) + if response.status == "completed": + break + time.sleep(1) + retries += 1 + + assert 
response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_background_cancel(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Write a long story about a cat.", + background=True, + ) + assert response is not None + time.sleep(1) + + cancelled_response = await client.responses.cancel(response.id) + assert cancelled_response is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_stateful_multi_turn(client: OpenAI, model_name: str): + response1 = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response1 is not None + assert response1.status == "completed" + + response2 = await client.responses.create( + model=model_name, + input="What if I increase both numbers by 1?", + previous_response_id=response1.id, + ) + assert response2 is not None + assert response2.status == "completed" + + response3 = await client.responses.create( + model=model_name, + input="Divide the result by 2.", + previous_response_id=response2.id, + ) + assert response3 is not None + assert response3.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_streaming(client: OpenAI, model_name: str): + prompts = [ + "tell me a story about a cat in 20 words", + "What is 13 * 24? Use python to calculate the result.", + "When did Jensen found NVIDIA? Search it and answer the year only.", + ] + + for prompt in prompts: + response = await client.responses.create( + model=model_name, + input=prompt, + reasoning={"effort": "low"}, + tools=[ + { + "type": "web_search_preview" + }, + { + "type": "code_interpreter", + "container": { + "type": "auto" + } + }, + ], + stream=True, + ) + + events = [] + current_event_mode = None + async for event in response: + if current_event_mode != event.type: + current_event_mode = event.type + print(f"\n[{event.type}] ", end="", flush=True) + + if "text.delta" in event.type: + print(event.delta, end="", flush=True) + elif "reasoning_text.delta" in event.type: + print(f"{event.delta}", end="", flush=True) + elif "response.code_interpreter_call_code.done" in event.type: + print(f"Code: {event.code}", end="", flush=True) + elif ("response.output_item.added" in event.type + and event.item.type == "web_search_call"): + print(f"Web search: {event.item.action}", end="", flush=True) + events.append(event) + + assert len(events) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_web_search(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Who is the president of South Korea as of now?", + tools=[{ + "type": "web_search_preview" + }], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_code_interpreter(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Multiply 64548*15151 using builtin python interpreter.", + tools=[{ + "type": "code_interpreter", + "container": { + "type": "auto" + } + }], + ) + assert response is not None + assert response.status == "completed" + + +def get_weather(latitude, longitude): + response = requests.get( + 
f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}¤t=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" # noqa + ) + data = response.json() + return data["current"]["temperature_2m"] + + +def get_place_to_travel(): + return "Paris" + + +def call_function(name, args): + if name == "get_weather": + return get_weather(**args) + elif name == "get_place_to_travel": + return get_place_to_travel() + else: + raise ValueError(f"Unknown function: {name}") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + response = await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + ) + assert response is not None + assert response.status == "completed" + assert len(response.output) == 2 + assert response.output[0].type == "reasoning" + assert response.output[1].type == "function_call" + + tool_call = response.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_2 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response.id, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert response_2.output_text is not None + + # NOTE: chain-of-thought should be removed. + response_3 = await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + previous_response_id=response_2.id, + ) + assert response_3 is not None + assert response_3.status == "completed" + assert response_3.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_multi_turn(client: OpenAI, model_name: str): + tools = [ + { + "type": "function", + "name": "get_place_to_travel", + "description": "Get a random place to travel", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False, + }, + "strict": True, + }, + { + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }, + ] + + response = await client.responses.create( + model=model_name, + input= + "Help me plan a trip to a random place. 
And tell me the weather there.", + tools=tools, + ) + assert response is not None + assert response.status == "completed" + assert len(response.output) == 2 + assert response.output[0].type == "reasoning" + assert response.output[1].type == "function_call" + + tool_call = response.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_2 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response.id, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert len(response_2.output) == 2 + assert response_2.output[0].type == "reasoning" + assert response_2.output[1].type == "function_call" + + tool_call = response_2.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_3 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response_2.id, + ) + assert response_3 is not None + assert response_3.status == "completed" + assert response_3.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_required(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + with pytest.raises(BadRequestError): + await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + tool_choice="required", + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_full_history(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + input_messages = [{ + "role": "user", + "content": "What's the weather like in Paris today?" 
+ }] + + response = await client.responses.create( + model=model_name, + input=input_messages, + tools=tools, + ) + + assert response is not None + assert response.status == "completed" + + tool_call = response.output[-1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + input_messages.extend( + response.output) # append model's function call message + input_messages.append( + { # append result message + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ) + + response_2 = await client.responses.create( + model=model_name, + input=input_messages, + tools=tools, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert response_2.output_text is not None From 9b94d6ec8f5f2e8ec2d3897ed05fb2b13cc012da Mon Sep 17 00:00:00 2001 From: Andy Chen <37168711+py-andy-c@users.noreply.github.com> Date: Mon, 11 Aug 2025 19:02:14 -0700 Subject: [PATCH 175/932] Enable 4bit bnb prequant MOE (#21548) Signed-off-by: Jee Jee Li Co-authored-by: Jee Jee Li --- .../model_executor/model_loader/bitsandbytes_loader.py | 10 +++------- vllm/model_executor/models/qwen3_moe.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index ea2fb2e3ac..b8393956ee 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): elif isinstance(module, FusedMoE) and hasattr( module.quant_method, "quant_config"): # TODO: support FusedMoE with prequant and 8bit. - if self.pre_quant: + if self.pre_quant and self.load_8bit: raise ValueError( - "Prequant BitsAndBytes models with FusedMoE is not " - "supported yet.") - if self.load_8bit: - raise ValueError( - "BitsAndBytes 8bit quantization with FusedMoE is not " - "supported yet.") + "Prequant BitsAndBytes 8bit models with FusedMoE " + "is not supported yet.") # Get the corresponding weight name using module name and # expert_params_mapping. 
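With the relaxed check above, pre-quantized 4-bit bitsandbytes checkpoints that use FusedMoE can now be loaded; only the pre-quantized 8-bit + FusedMoE combination is still rejected. A minimal usage sketch, not part of this patch, with a hypothetical placeholder checkpoint name:

    # Hedged sketch: loading a pre-quantized 4-bit bitsandbytes MoE checkpoint.
    # "some-org/qwen3-moe-bnb-4bit" is a placeholder, not a real repository.
    from vllm import LLM

    llm = LLM(
        model="some-org/qwen3-moe-bnb-4bit",
        quantization="bitsandbytes",
    )
    outputs = llm.generate("Hello, my name is")
    print(outputs[0].outputs[0].text)
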
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 9b49952f37..085fc90b47 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, return loader.load_weights(weights) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() + return self.model.get_expert_mapping() \ No newline at end of file From 839ab0034932e5e6863a8d837e5b04944fa0cac5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 03:54:40 +0100 Subject: [PATCH 176/932] Re-enable Xet on TPU tests now that `hf_xet` has been updated (#22666) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- tests/entrypoints/llm/test_accuracy.py | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 10d2e23649..b571618f48 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -128,7 +128,7 @@ run_and_track_test() { # --- Actual Test Execution --- run_and_track_test 1 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" run_and_track_test 2 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 3 "test_lora.py" \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 9e7b5a5462..d55a786e41 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \ run_and_track_test 2 "test_basic.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" run_and_track_test 4 "test_quantization_accuracy.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" run_and_track_test 5 "examples/offline_inference/tpu.py" \ diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 39bc8ab07d..5d605e906e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -96,9 +96,6 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( more_args = None if current_platform.is_tpu(): # Limit compilation time for TPU V1 - - # xet doesn't work well for Qwen/Qwen3-1.7B - m.setenv("HF_HUB_DISABLE_XET", "1") more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" # Add TP test (if provided) From dc5e4a653c859573dfcca99f1b0141c2db9f94cc Mon Sep 17 
00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Tue, 12 Aug 2025 10:58:41 +0800 Subject: [PATCH 177/932] Upgrade FlashInfer to v0.2.11 (#22613) Signed-off-by: Po-Han Huang Co-authored-by: mgoin --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 85f55cac8d..b96d50f0a1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -387,7 +387,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.10" +ARG FLASHINFER_GIT_REF="v0.2.11" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ diff --git a/setup.py b/setup.py index 7f6c787129..919300e143 100644 --- a/setup.py +++ b/setup.py @@ -684,7 +684,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.10"], + "flashinfer": ["flashinfer-python==0.2.11"], }, cmdclass=cmdclass, package_data=package_data, From ea1292ad3ee724e44b3dfec2a26778cd614729f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 11 Aug 2025 23:20:42 -0400 Subject: [PATCH 178/932] [CI Failure] Use float32 for tests/entrypoints/openai/test_audio.py (#22686) Signed-off-by: mgoin --- tests/entrypoints/openai/test_audio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index d67c05ab3e..2d33d3c3a6 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -23,6 +23,8 @@ MAXIMUM_AUDIOS = 2 @pytest.fixture(scope="module") def server(): args = [ + "--dtype", + "float32", "--max-model-len", "2048", "--max-num-seqs", From 93d0652433f9385959d5296a4dc1c98ec58f0d58 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 11 Aug 2025 23:31:36 -0400 Subject: [PATCH 179/932] [CI] Increase timeout for test_completion_with_image_embeds (#22670) Signed-off-by: mgoin --- .../v1/entrypoints/openai/test_completion_with_image_embeds.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py index be98be8d14..41f1d02bf7 100644 --- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py +++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py @@ -38,7 +38,8 @@ def default_image_embeds_server_args() -> list[str]: @pytest.fixture(scope="module") def server_with_image_embeds(default_image_embeds_server_args): with RemoteOpenAIServer(MODEL_NAME, - default_image_embeds_server_args) as remote_server: + default_image_embeds_server_args, + max_wait_seconds=600) as remote_server: yield remote_server From 467850347687f0ef76c1a57d79e2c0639eaa1456 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Mon, 11 Aug 2025 20:43:37 -0700 Subject: [PATCH 180/932] Migrate MiniCPMVImageInputs to TensorSchema (#21939) Signed-off-by: Benji Beck --- vllm/model_executor/models/minicpmv.py | 65 ++++++++++++++------------ 1 file changed, 36 
insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 3aa16bb9ab..7db3a1bb90 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -27,7 +27,7 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union import numpy as np import torch @@ -63,6 +63,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import flatten_2d_lists +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -74,36 +75,47 @@ from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, _MAX_FRAMES_PER_VIDEO = 16 -class MiniCPMVImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: list[torch.Tensor] +class MiniCPMVImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` - - Note that the image size may vary, so we pass it as a list - instead of a batched tensor. + Dimensions: + - bns: Batch size * number of images * number of slices + - bn: Batch size * number of images + - c: Number of channels + - h: Height + - w: Width """ - tgt_sizes: torch.Tensor - """ - Shape: `(batch_size * num_images * num_slices, 2)` + type: Literal["pixel_values"] = "pixel_values" - This should be in `(height, width)` format. + # Note that the image size may vary, so we pass it as a list instead of a + # batched tensor. + pixel_values: Annotated[ + list[torch.Tensor], + TensorShape("bns", "c", "h", "w"), + ] + tgt_sizes: Annotated[ + torch.Tensor, + TensorShape("bns", 2), # This should be in `(height, width)` format. + ] + num_slices: Annotated[ + torch.Tensor, + TensorShape("bn"), + ] + + +class MiniCPMVImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - ns: Number of slices + - hs: Hidden size (must match language model backbone) """ - num_slices: torch.Tensor - """Shape: `(batch_size * num_images)`""" - - -class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - image_embeds: Union[torch.Tensor, list[torch.Tensor]] - """ - Shape: `(batch_size * num_images, num_slices, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. - instead of a batched tensor. - """ + image_embeds: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "ns", "hs"), + ] MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, @@ -832,11 +844,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): pixel_values_flat = flatten_bn(flatten_2d_lists(pixel_values)) tgt_sizes_flat = flatten_bn(flatten_2d_lists(tgt_sizes), concat=True) - if len(pixel_values_flat) != len(tgt_sizes_flat): - raise ValueError("Inconsistent flattened lengths, found: " - f"{len(pixel_values_flat)} vs. 
" - f"{len(tgt_sizes_flat)}") - return MiniCPMVImagePixelInputs( type="pixel_values", pixel_values=pixel_values_flat, From bbaf9e9cb15af23e7a1fd250bf49a5efb15cadf7 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 21:22:26 -0700 Subject: [PATCH 181/932] [gpt-oss] Fix mxfp4 support (#22700) Signed-off-by: Chen Zhang --- vllm/model_executor/layers/quantization/utils/mxfp4_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 4084dd837c..95eabe149d 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -68,7 +68,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "silu" + or scoring_func != "softmax" or activation != "swiglu_oai" or expert_load_view or logical_to_physical_map or logical_replica_count) From ad344ef552ece428d12e04fbcb64b8b50768283b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 22:04:38 -0700 Subject: [PATCH 182/932] [gpt-oss] Small bug fixes for frontend (#22512) Signed-off-by: Chen Zhang --- vllm/entrypoints/context.py | 54 +++++++++++++++----- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_responses.py | 29 ++++++----- vllm/entrypoints/tool.py | 15 +++++- vllm/entrypoints/tool_server.py | 5 +- 5 files changed, 76 insertions(+), 32 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 6292306e7c..e817f07ef5 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -1,15 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import logging from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Union -from openai_harmony import Message, Role, StreamState +from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, render_for_completion) from vllm.entrypoints.tool import Tool from vllm.outputs import RequestOutput +if TYPE_CHECKING: + from mcp.client import ClientSession + logger = logging.getLogger(__name__) @@ -71,6 +76,7 @@ class HarmonyContext(ConversationContext): def append_output(self, output) -> None: if isinstance(output, RequestOutput): output_token_ids = output.outputs[0].token_ids + self.parser = get_streamable_parser_for_assistant() for token_id in output_token_ids: self.parser.process(token_id) output_msgs = self.parser.messages @@ -106,19 +112,41 @@ class HarmonyContext(ConversationContext): def render_for_completion(self) -> list[int]: return render_for_completion(self.messages) - async def call_search_tool( - self, - tool_session: Tool, - last_msg: Message, - ) -> list[Message]: - return await tool_session.get_result(self) + async def call_search_tool(self, tool_session: Union["ClientSession", + Tool], + last_msg: Message) -> list[Message]: + if isinstance(tool_session, Tool): + return await tool_session.get_result(self) + tool_name = last_msg.recipient.split(".")[1] + args = json.loads(last_msg.content[0].text) + result = await tool_session.call_tool(tool_name, args) + result_str = 
result.content[0].text + content = TextContent(text=result_str) + author = Author(role=Role.TOOL, name=last_msg.recipient) + return [ + Message(author=author, content=[content], recipient=Role.ASSISTANT) + ] - async def call_python_tool( - self, - tool_session: Tool, - last_msg: Message, - ) -> list[Message]: - return await tool_session.get_result(self) + async def call_python_tool(self, tool_session: Union["ClientSession", + Tool], + last_msg: Message) -> list[Message]: + if isinstance(tool_session, Tool): + return await tool_session.get_result(self) + param = { + "code": last_msg.content[0].text, + } + result = await tool_session.call_tool("python", param) + result_str = result.content[0].text + + content = TextContent(text=result_str) + author = Author(role=Role.TOOL, name="python") + + return [ + Message(author=author, + content=[content], + channel=last_msg.channel, + recipient=Role.ASSISTANT) + ] class StreamingHarmonyContext(HarmonyContext): diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3b9f4b544e..543701ed14 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import ( # yapf: enable from openai.types.responses import (ResponseFunctionToolCall, ResponseInputItemParam, ResponseOutputItem, - ResponsePrompt, ResponseStatus, - ResponseTextConfig) + ResponsePrompt, ResponseReasoningItem, + ResponseStatus, ResponseTextConfig) from openai.types.responses.response import ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning @@ -239,6 +239,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors], ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, + ResponseReasoningItem, ResponseFunctionToolCall] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 089f50a1e6..86c16df40e 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -16,8 +16,7 @@ from fastapi import Request from openai import BaseModel # yapf conflicts with isort for this block # yapf: disable -from openai.types.responses import (ResponseContentPartDoneEvent, - ResponseCreatedEvent, +from openai.types.responses import (ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent, ResponseOutputItem, @@ -54,7 +53,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.tool_server import ToolServer +from vllm.entrypoints.tool_server import MCPToolServer, ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput @@ -238,6 +237,15 @@ class OpenAIServingResponses(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata + if self.tool_server is not None and isinstance( + self.tool_server, MCPToolServer + ) and (request.background or request.stream) and request.tools and any( + tool.type in ["web_search_preview", "code_interpreter"] + for tool in request.tools): + return self.create_error_response( + "MCP tool server is not supported in background mode and " + "streaming mode") + # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[ConversationContext, None]] = [] @@ -844,9 +852,13 @@ class OpenAIServingResponses(OpenAIServing): type="reasoning", content=[ ResponseReasoningTextContent( - text=previous_item.content[0].text), + text=previous_item.content[0].text, + type="reasoning_text", + ), ], status="completed", + id=current_item_id, + summary=[], ) yield _send_event( ResponseReasoningTextDoneEvent( @@ -857,15 +869,6 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) - yield _send_event( - ResponseContentPartDoneEvent( - type="response.content_part.done", - item_id=current_item_id, - sequence_number=-1, - output_index=current_output_index, - content_index=current_content_index, - part=reasoning_item, - )) yield _send_event( ResponseOutputItemDoneEvent( type="response.output_item.done", diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 01ee77414f..723cff91d4 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional + +from openai_harmony import Message from vllm.logger import init_logger @@ -70,7 +72,16 @@ class HarmonyPythonTool(Tool): "gpt_oss is not installed, code interpreter is disabled") return - self.python_tool = PythonTool() + # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response + # and we do the following monkey patch to fix it. + class PatchedGptOssPythonTool(PythonTool): + + def _make_response(self, + output: str, + channel: Optional[str] = None) -> Message: + return super()._make_response(output) + + self.python_tool = PatchedGptOssPythonTool() logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 352704b2b3..2f28595f27 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import TYPE_CHECKING, Any, Optional -from openai_harmony import ToolNamespaceConfig +from openai_harmony import ToolDescription, ToolNamespaceConfig from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool from vllm.logger import init_logger @@ -105,7 +105,6 @@ class MCPToolServer(ToolServer): self.harmony_tool_descriptions = {} async def add_tool_server(self, server_url: str): - from mcp.types import ToolDescription tool_urls = server_url.split(",") self.harmony_tool_descriptions = {} self.urls: dict[str, str] = {} @@ -133,6 +132,8 @@ class MCPToolServer(ToolServer): logger.warning( "Tool %s already exists. 
Ignoring duplicate tool server %s", tool_from_mcp.name, url) + logger.info("MCPToolServer initialized with tools: %s", + list(self.harmony_tool_descriptions.keys())) def has_tool(self, tool_name: str): return tool_name in self.harmony_tool_descriptions From 4fbd8bb597cf392b94def04a6955f22580356d76 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 06:13:32 +0100 Subject: [PATCH 183/932] Fix passing `SpeculativeConfig` from the CLI (#22652) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41a6da709b..d74db67bda 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -826,6 +826,10 @@ class EngineArgs: title="VllmConfig", description=VllmConfig.__doc__, ) + # We construct SpeculativeConfig using fields from other configs in + # create_engine_config. So we set the type to a JSON string here to + # delay the Pydantic validation that comes with SpeculativeConfig. + vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads) vllm_group.add_argument("--speculative-config", **vllm_kwargs["speculative_config"]) vllm_group.add_argument("--kv-transfer-config", From 3a7e3bbdd255b470d37727a31cc0471aa0fe6ecb Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Tue, 12 Aug 2025 15:14:51 +0800 Subject: [PATCH 184/932] [Doc] Added unmentioned required option "method" in the usage of EAGLE-3 based models (#21737) Signed-off-by: Dilute-l Co-authored-by: Dilute-l --- docs/features/spec_decode.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index 89d5b489e1..597a8e8644 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -203,6 +203,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "draft_tensor_parallel_size": 1, "num_speculative_tokens": 2, + "method": "eagle", }, ) @@ -231,6 +232,9 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: . +4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3". + That is, to specify `"method": "eagle3"` in `speculative_config`. 
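+    A minimal sketch of such a config (model names here are illustrative; substitute
+    the EAGLE-3 draft model that matches your base model):
+
+    ```python
+    from vllm import LLM
+
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        speculative_config={
+            "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
+            "num_speculative_tokens": 2,
+            "method": "eagle3",
+        },
+    )
+    ```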
+ A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | From 2f4657952b1a118e616165e57af94c9007121fb8 Mon Sep 17 00:00:00 2001 From: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:51:08 +0530 Subject: [PATCH 185/932] [doc] Update x86 CPU-inference installation doc to reflect optionality of AVX512f (#22707) Signed-off-by: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- docs/getting_started/installation/cpu/x86.inc.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md index 49e223f9b9..6dc6f94249 100644 --- a/docs/getting_started/installation/cpu/x86.inc.md +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -6,7 +6,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data # --8<-- [start:requirements] - OS: Linux -- CPU flags: `avx512f`, `avx512_bf16` (Optional), `avx512_vnni` (Optional) +- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional) !!! tip Use `lscpu` to check the CPU flags. @@ -28,7 +28,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) !!! warning - If deploying the pre-built images on machines only contain `avx512f`, `Illegal instruction` error may be raised. It is recommended to build images for these machines with `--build-arg VLLM_CPU_AVX512BF16=false` and `--build-arg VLLM_CPU_AVX512VNNI=false`. + If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. It is recommended to build images for these machines with the appropriate build arguments (e.g., `--build-arg VLLM_CPU_DISABLE_AVX512=true`, `--build-arg VLLM_CPU_AVX512BF16=false`, or `--build-arg VLLM_CPU_AVX512VNNI=false`) to disable unsupported features. Please note that without `avx512f`, AVX2 will be used and this version is not recommended because it only has basic feature support. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] @@ -37,6 +37,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data docker build -f docker/Dockerfile.cpu \ --build-arg VLLM_CPU_AVX512BF16=false (default)|true \ --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \ + --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ --tag vllm-cpu-env \ --target vllm-openai . From 6d729c43fbaf63d534e71c0b8aa61f0a82dd2018 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 15:23:17 +0800 Subject: [PATCH 186/932] [Bugfix] Fix ModernBert load & Enable sliding window attention for bidirectional attention. 
(#22637) Signed-off-by: wang.yuqi Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser --- tests/models/language/pooling/test_gte.py | 21 ++++- vllm/model_executor/models/modernbert.py | 31 +++---- vllm/v1/attention/backends/flash_attn.py | 2 + vllm/v1/worker/gpu_model_runner.py | 106 ++++++++++++++-------- 4 files changed, 101 insertions(+), 59 deletions(-) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 5a5fdfbb21..f805a64103 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,10 +4,11 @@ from typing import Any import pytest -from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, check_transformers_version) +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, LASTPoolingEmbedModelInfo, + RerankModelInfo, check_transformers_version) from .embed_utils import correctness_test_embed_models -from .mteb_utils import mteb_test_embed_models +from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel @@ -58,6 +59,14 @@ MODELS = [ enable_test=False), ] +RERANK_MODELS = [ + # classifier_pooling: mean + CLSPoolingRerankModelInfo( + "Alibaba-NLP/gte-reranker-modernbert-base", + architecture="ModernBertForSequenceClassification", + enable_test=True), +] + @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, @@ -88,3 +97,9 @@ def test_embed_models_correctness(hf_runner, vllm_runner, correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts, vllm_extra_kwargs) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(hf_runner, vllm_runner, + model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 2c3bdd1c93..c6e84e2d4e 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,8 +26,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsV0Only, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, default_pooling_type from .utils import WeightsMapper, maybe_prefix @@ -93,16 +92,14 @@ class ModernBertAttention(nn.Module): bias=config.attention_bias, ) + sliding_window = None if layer_id % config.global_attn_every_n_layers != 0: - self.local_attention = (config.local_attention // 2, - config.local_attention // 2) + sliding_window = config.local_attention // 2 + rope_theta = config.local_rope_theta if config.local_rope_theta \ + is not None else config.global_rope_theta else: - self.local_attention = (-1, -1) + rope_theta = config.global_rope_theta - rope_theta = config.global_rope_theta - if self.local_attention != ( - -1, -1) and config.local_rope_theta is not None: - rope_theta = config.local_rope_theta self.rotary_emb = ModernBertRotaryEmbedding(config=config, head_size=self.head_dim, dim=self.head_dim, @@ -111,7 +108,8 @@ class ModernBertAttention(nn.Module): self.head_dim, self.scaling, prefix=f"{layer_id}.attn", - attn_type=AttentionType.ENCODER_ONLY) + attn_type=AttentionType.ENCODER_ONLY, + per_layer_sliding_window=sliding_window) self.Wo = RowParallelLinear(config.hidden_size, config.hidden_size, 
bias=config.attention_bias) @@ -278,6 +276,7 @@ class ModernBertPooler(Pooler): return self.pooling.get_pooling_updates(task) def _head(self, pooled_output: torch.Tensor): + pooled_output = pooled_output.to(self.dense.weight.dtype) return self.norm(self.act(self.dense(pooled_output))) def forward( @@ -296,8 +295,7 @@ class ModernBertPooler(Pooler): @default_pooling_type("CLS") -class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, - SupportsCrossEncoding): +class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): is_pooling_model = True @@ -308,6 +306,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.pooling = ModernBertPooler(config) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None @@ -317,14 +316,14 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, Pooler.for_encode(pooler_config), "classify": ClassifierPooler( - pooling=ModernBertPooler(config), + pooling=self.pooling, classifier=self.classifier, act_fn=ClassifierPooler.act_fn_for_seq_cls( vllm_config.model_config), ), "score": ClassifierPooler( - pooling=ModernBertPooler(config), + pooling=self.pooling, classifier=self.classifier, act_fn=ClassifierPooler.act_fn_for_cross_encoder( vllm_config.model_config), @@ -353,7 +352,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, default_weight_loader) weight_loader(param, loaded_weight) if name.startswith("head"): - param = params_dict["_pooler.pooler." + name[len("head") + 1:]] + param = params_dict["pooling." + name[len("head") + 1:]] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) @@ -368,5 +367,5 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, return self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, - position_ids=positions, + positions=positions, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 95ba56b359..a411477bc3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -384,6 +384,8 @@ class FlashAttentionImpl(AttentionImpl): self.alibi_slopes = alibi_slopes if sliding_window is None: self.sliding_window = (-1, -1) + elif attn_type == AttentionType.ENCODER_ONLY: + self.sliding_window = (sliding_window - 1, sliding_window - 1) else: self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 045a06d927..ed4d6bcb09 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -826,7 +826,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Prepare encoder attention metadata separately # (encoder layers are not in KV cache groups) if self.is_encoder_only_model: - common_attn_metadata, encoder_attn_metadata = \ + + per_layer_metadata = \ self._build_encoder_only_attn_metadata( scheduler_output) @@ -835,6 +836,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, Attention) for layer_name, attn_module in attention_layers.items(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: + common_attn_metadata, encoder_attn_metadata =\ + per_layer_metadata[layer_name] 
attn_metadata[layer_name] = encoder_attn_metadata # Prepare the attention metadata for each KV cache group and make layers @@ -2683,30 +2686,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Check if model is encoder-only block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla - attn_specs = list[AttentionSpec]() - for attn_module in attn_layers.values(): + attn_specs: dict[AttentionSpec, list[str]] = defaultdict(list) + for layer_name, attn_module in attn_layers.items(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: - assert attn_module.sliding_window is None, "Sliding " - "window attention is not supported for encoder-only models" + if attn_module.sliding_window is None: + attn_spec: AttentionSpec = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + use_mla=use_mla) + else: + attn_spec = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + sliding_window=attn_module.sliding_window, + use_mla=use_mla) + attn_specs[attn_spec].append(layer_name) - attn_specs.append( - FullAttentionSpec(block_size=block_size, - num_kv_heads=attn_module.num_kv_heads, - head_size=attn_module.head_size, - dtype=self.kv_cache_dtype, - use_mla=use_mla)) else: raise ValueError("Expected only encoder-only layers") if len(attn_specs) > 0: - assert len(attn_specs) == len(attn_layers), \ + total_layers = 0 + for attn_spec, layer_names in attn_specs.items(): + + attn_backends = get_attn_backends_for_layers(layer_names) + total_layers += len(layer_names) + + self.attn_groups.append( + create_attn_groups(attn_backends, attn_spec)) + assert total_layers == len(attn_layers), \ "All or none of the layers are expected to be encoder-only" - - attn_backends = get_attn_backends_for_layers(attn_layers.keys()) - - self.attn_groups.append( - create_attn_groups(attn_backends, attn_specs[0])) self.is_encoder_only_model = True def calculate_reorder_batch_threshold(self) -> None: @@ -3071,7 +3085,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _build_encoder_only_attn_metadata( self, scheduler_output: "SchedulerOutput") -> \ - tuple[CommonAttentionMetadata, Any]: + dict[str, tuple[CommonAttentionMetadata, Any]]: """Prepare encoder attention metadata for encoder-only models. Args: @@ -3088,10 +3102,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] max_num_scheduled_tokens = max(tokens) - # Use the first attention metadata builder - # to create encoder attention metadata - builder = self.attn_groups[0][0].metadata_builder - dummy_block_table = torch.zeros((num_reqs, 1), dtype=torch.int32, device=self.device) @@ -3099,22 +3109,38 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtype=torch.int32, device=self.device) - common_metadata = CommonAttentionMetadata( - query_start_loc=self.query_start_loc[:num_reqs + 1], - query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], - seq_lens=self.seq_lens[:num_reqs], - seq_lens_cpu=self.seq_lens_cpu[:num_reqs], - num_computed_tokens_cpu=self.input_batch. 
- num_computed_tokens_cpu_tensor[:num_reqs], - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - block_table_tensor=dummy_block_table, - slot_mapping=dummy_slot_mapping, - causal=False, - ) + group_metadata = dict[str, tuple[CommonAttentionMetadata, Any]]() - return common_metadata, builder.build( - common_prefix_len=0, # No cascade for encoder - common_attn_metadata=common_metadata, - ) + for attn_group_list in self.attn_groups: + + assert len(attn_group_list) == 1 + attn_group = attn_group_list[0] + + # Use the first attention metadata builder + # to create encoder attention metadata + builder = attn_group.metadata_builder + + common_metadata = CommonAttentionMetadata( + query_start_loc=self.query_start_loc[:num_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], + seq_lens=self.seq_lens[:num_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_reqs], + num_computed_tokens_cpu=self.input_batch. + num_computed_tokens_cpu_tensor[:num_reqs], + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + block_table_tensor=dummy_block_table, + slot_mapping=dummy_slot_mapping, + causal=False, + ) + + metadata = builder.build( + common_prefix_len=0, # No cascade for encoder + common_attn_metadata=common_metadata, + ) + + for layer_name in attn_group.layer_names: + group_metadata[layer_name] = (common_metadata, metadata) + + return group_metadata From 78077d5417aee128ac4fe92220476ea721ac27e4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 08:23:49 +0100 Subject: [PATCH 187/932] Move `SchedulerConfig` from `config/__init__.py` to `config/scheduler.py` (#22626) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 316 +------------------------------------ vllm/config/scheduler.py | 329 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 314 deletions(-) create mode 100644 vllm/config/scheduler.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 03ab034c62..159106003f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -34,6 +34,7 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -47,15 +48,9 @@ from vllm.transformers_utils.config import ( try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -# yapf conflicts with isort for this block -# yapf: disable -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType, +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType, LazyLoader, common_broadcastable_dtype, random_uuid) -# yapf: enable - if TYPE_CHECKING: from _typeshed import DataclassInstance from transformers.configuration_utils import PretrainedConfig @@ -1820,313 +1815,6 @@ class LoadConfig: self.ignore_patterns = ["original/**/*"] -PreemptionMode = Literal["swap", "recompute"] 
-SchedulerPolicy = Literal["fcfs", "priority"] - - -@config -@dataclass -class SchedulerConfig: - """Scheduler configuration.""" - - runner_type: RunnerType = "generate" - """The runner type to launch for the model.""" - - max_num_batched_tokens: SkipValidation[int] = None # type: ignore - """Maximum number of tokens to be processed in a single iteration. - - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" - - max_num_seqs: SkipValidation[int] = None # type: ignore - """Maximum number of sequences to be processed in a single iteration. - - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" - - max_model_len: SkipValidation[int] = None # type: ignore - """Maximum length of a sequence (including prompt and generated text). This - is primarily set in `ModelConfig` and that value should be manually - duplicated here.""" - - max_num_partial_prefills: int = 1 - """For chunked prefill, the maximum number of sequences that can be - partially prefilled concurrently.""" - - max_long_partial_prefills: int = 1 - """For chunked prefill, the maximum number of prompts longer than - long_prefill_token_threshold that will be prefilled concurrently. Setting - this less than max_num_partial_prefills will allow shorter prompts to jump - the queue in front of longer prompts in some cases, improving latency.""" - - long_prefill_token_threshold: int = 0 - """For chunked prefill, a request is considered long if the prompt is - longer than this number of tokens.""" - - num_lookahead_slots: int = 0 - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - - cuda_graph_sizes: list[int] = field(default_factory=list) - """Cuda graph capture sizes - 1. if none provided, then default set to [min(max_num_seqs * 2, 512)] - 2. if one value is provided, then the capture list would follow the - pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] - 3. more than one value (e.g. 1 2 128) is provided, then the capture list - will follow the provided list.""" - - delay_factor: float = 0.0 - """Apply a delay (of delay factor multiplied by previous - prompt latency) before scheduling next prompt.""" - - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore - """If True, prefill requests can be chunked based - on the remaining max_num_batched_tokens.""" - - is_multimodal_model: bool = False - """True if the model is multimodal.""" - - # TODO (ywang96): Make this configurable. - max_num_encoder_input_tokens: int = field(init=False) - """Multimodal encoder compute budget, only used in V1. - - NOTE: This is not currently configurable. It will be overridden by - max_num_batched_tokens in case max multimodal embedding size is larger.""" - - # TODO (ywang96): Make this configurable. - encoder_cache_size: int = field(init=False) - """Multimodal encoder cache size, only used in V1. - - NOTE: This is not currently configurable. 
It will be overridden by - max_num_batched_tokens in case max multimodal embedding size is larger.""" - - preemption_mode: Optional[PreemptionMode] = None - """Whether to perform preemption by swapping or - recomputation. If not specified, we determine the mode as follows: - We use recomputation by default since it incurs lower overhead than - swapping. However, when the sequence group has multiple sequences - (e.g., beam search), recomputation is not currently supported. In - such a case, we use swapping instead.""" - - num_scheduler_steps: int = 1 - """Maximum number of forward steps per scheduler call.""" - - multi_step_stream_outputs: bool = True - """If False, then multi-step will stream outputs at the end of all steps""" - - send_delta_data: bool = False - """Private API. If used, scheduler sends delta data to - workers instead of an entire data. It should be enabled only - when SPMD worker architecture is enabled. I.e., - VLLM_USE_RAY_SPMD_WORKER=1""" - - policy: SchedulerPolicy = "fcfs" - """The scheduling policy to use:\n - - "fcfs" means first come first served, i.e. requests are handled in order - of arrival.\n - - "priority" means requests are handled based on given priority (lower - value means earlier handling) and time of arrival deciding any ties).""" - - chunked_prefill_enabled: bool = field(init=False) - """True if chunked prefill is enabled.""" - - disable_chunked_mm_input: bool = False - """If set to true and chunked prefill is enabled, we do not want to - partially schedule a multimodal item. Only used in V1 - This ensures that if a request has a mixed prompt - (like text tokens TTTT followed by image tokens IIIIIIIIII) where only - some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), - it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" - - # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) - # or "mod.custom_class". - scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" - """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the - default scheduler. Can be a class directly or the path to a class of form - "mod.custom_class".""" - - disable_hybrid_kv_cache_manager: bool = False - """If set to True, KV cache manager will allocate the same size of KV cache - for all attention layers even if there are multiple type of attention layers - like full attention and sliding window attention. - """ - - async_scheduling: bool = False - """EXPERIMENTAL: If set to True, perform async scheduling. This may help - reduce the CPU overheads, leading to better latency and throughput. However, - async scheduling is currently not supported with some features such as - structured outputs, speculative decoding, and pipeline parallelism. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. 
- factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self) -> None: - if self.max_model_len is None: - self.max_model_len = 8192 - - if self.max_num_seqs is None: - self.max_num_seqs = 128 - - if self.max_num_batched_tokens is None: - if self.enable_chunked_prefill: - if self.num_scheduler_steps > 1: - # Multi-step Chunked-Prefill doesn't allow prompt-chunking - # for now. Have max_num_batched_tokens set to max_model_len - # so we don't reject sequences on account of a short - # max_num_batched_tokens. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - self.max_num_batched_tokens = ( - DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - # If max_model_len is too short, use - # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value - # for higher throughput. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - - if self.runner_type == "pooling": - # Choose specific value for higher throughput - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - if self.is_multimodal_model: - # The value needs to be at least the number of multimodal tokens - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - - # When using default settings, - # Ensure max_num_batched_tokens does not exceed model limit. - # Some models (e.g., Whisper) have embeddings tied to max length. - self.max_num_batched_tokens = min( - self.max_num_seqs * self.max_model_len, - self.max_num_batched_tokens) - - self.max_num_encoder_input_tokens = self.max_num_batched_tokens - self.encoder_cache_size = self.max_num_batched_tokens - - if self.enable_chunked_prefill: - logger.info( - "Chunked prefill is enabled with max_num_batched_tokens=%d.", - self.max_num_batched_tokens) - - self.chunked_prefill_enabled = self.enable_chunked_prefill - if self.max_num_partial_prefills > 1: - if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * - 0.04) - - logger.info( - "Concurrent partial prefills enabled with " - "max_num_partial_prefills=%d, max_long_partial_prefills=%d, " - "long_prefill_token_threshold=%d", - self.max_num_partial_prefills, self.max_long_partial_prefills, - self.long_prefill_token_threshold) - - # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)]. - # This avoids OOM in tight memory scenarios with small max_num_seqs, - # and prevents capture of many large graphs (>512) that would greatly - # increase startup time with limited performance benefit. - if not self.cuda_graph_sizes: - self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)] - - if self.async_scheduling: - self.scheduler_cls = ( - "vllm.v1.core.sched.async_scheduler.AsyncScheduler") - - @model_validator(mode='after') - def _verify_args(self) -> Self: - if (self.max_num_batched_tokens < self.max_model_len - and not self.chunked_prefill_enabled): - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " - f"smaller than max_model_len ({self.max_model_len}). " - "This effectively limits the maximum sequence length to " - "max_num_batched_tokens and makes vLLM reject longer " - "sequences. 
Please increase max_num_batched_tokens or " - "decrease max_model_len.") - - if self.max_num_batched_tokens < self.max_num_seqs: - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " - "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") - - if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: - logger.warning( - "max_num_batched_tokens (%d) exceeds max_num_seqs " - "* max_model_len (%d). This may lead to unexpected behavior.", - self.max_num_batched_tokens, - self.max_num_seqs * self.max_model_len) - - if self.num_lookahead_slots < 0: - raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0.") - - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - - if self.max_num_partial_prefills < 1: - raise ValueError( - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " - "must be greater than or equal to 1.") - elif self.max_num_partial_prefills > 1: - if not self.chunked_prefill_enabled: - raise ValueError("Chunked prefill must be enabled to set " - "max_num_partial_prefills > 1.") - - if self.long_prefill_token_threshold > self.max_model_len: - raise ValueError( - "long_prefill_token_threshold " - f"({self.long_prefill_token_threshold}) cannot be greater " - f"than the max_model_len ({self.max_model_len}).") - - if (self.max_long_partial_prefills - < 1) or (self.max_long_partial_prefills - > self.max_num_partial_prefills): - raise ValueError( - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " - "must be greater than or equal to 1 and less than or equal to " - f"max_num_partial_prefills ({self.max_num_partial_prefills}).") - - return self - - @property - def is_multi_step(self) -> bool: - return self.num_scheduler_steps > 1 - - Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py new file mode 100644 index 0000000000..db669600a0 --- /dev/null +++ b/vllm/config/scheduler.py @@ -0,0 +1,329 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, Union + +from pydantic import SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS) + +if TYPE_CHECKING: + from vllm.config import RunnerType +else: + RunnerType = Any + +logger = init_logger(__name__) + +PreemptionMode = Literal["swap", "recompute"] +SchedulerPolicy = Literal["fcfs", "priority"] + + +@config +@dataclass +class SchedulerConfig: + """Scheduler configuration.""" + + runner_type: RunnerType = "generate" + """The runner type to launch for the model.""" + + max_num_batched_tokens: SkipValidation[int] = None # type: ignore + """Maximum number of tokens to be processed in a single iteration. + + This config has no static default. If left unspecified by the user, it will + be set in `EngineArgs.create_engine_config` based on the usage context.""" + + max_num_seqs: SkipValidation[int] = None # type: ignore + """Maximum number of sequences to be processed in a single iteration. 
+ + This config has no static default. If left unspecified by the user, it will + be set in `EngineArgs.create_engine_config` based on the usage context.""" + + max_model_len: SkipValidation[int] = None # type: ignore + """Maximum length of a sequence (including prompt and generated text). This + is primarily set in `ModelConfig` and that value should be manually + duplicated here.""" + + max_num_partial_prefills: int = 1 + """For chunked prefill, the maximum number of sequences that can be + partially prefilled concurrently.""" + + max_long_partial_prefills: int = 1 + """For chunked prefill, the maximum number of prompts longer than + long_prefill_token_threshold that will be prefilled concurrently. Setting + this less than max_num_partial_prefills will allow shorter prompts to jump + the queue in front of longer prompts in some cases, improving latency.""" + + long_prefill_token_threshold: int = 0 + """For chunked prefill, a request is considered long if the prompt is + longer than this number of tokens.""" + + num_lookahead_slots: int = 0 + """The number of slots to allocate per sequence per + step, beyond the known token ids. This is used in speculative + decoding to store KV activations of tokens which may or may not be + accepted. + + NOTE: This will be replaced by speculative config in the future; it is + present to enable correctness tests until then.""" + + cuda_graph_sizes: list[int] = field(default_factory=list) + """Cuda graph capture sizes + 1. if none provided, then default set to [min(max_num_seqs * 2, 512)] + 2. if one value is provided, then the capture list would follow the + pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] + 3. more than one value (e.g. 1 2 128) is provided, then the capture list + will follow the provided list.""" + + delay_factor: float = 0.0 + """Apply a delay (of delay factor multiplied by previous + prompt latency) before scheduling next prompt.""" + + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + """If True, prefill requests can be chunked based + on the remaining max_num_batched_tokens.""" + + is_multimodal_model: bool = False + """True if the model is multimodal.""" + + # TODO (ywang96): Make this configurable. + max_num_encoder_input_tokens: int = field(init=False) + """Multimodal encoder compute budget, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + # TODO (ywang96): Make this configurable. + encoder_cache_size: int = field(init=False) + """Multimodal encoder cache size, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + preemption_mode: Optional[PreemptionMode] = None + """Whether to perform preemption by swapping or + recomputation. If not specified, we determine the mode as follows: + We use recomputation by default since it incurs lower overhead than + swapping. However, when the sequence group has multiple sequences + (e.g., beam search), recomputation is not currently supported. In + such a case, we use swapping instead.""" + + num_scheduler_steps: int = 1 + """Maximum number of forward steps per scheduler call.""" + + multi_step_stream_outputs: bool = True + """If False, then multi-step will stream outputs at the end of all steps""" + + send_delta_data: bool = False + """Private API. If used, scheduler sends delta data to + workers instead of an entire data. 
It should be enabled only + when SPMD worker architecture is enabled. I.e., + VLLM_USE_RAY_SPMD_WORKER=1""" + + policy: SchedulerPolicy = "fcfs" + """The scheduling policy to use:\n + - "fcfs" means first come first served, i.e. requests are handled in order + of arrival.\n + - "priority" means requests are handled based on given priority (lower + value means earlier handling) and time of arrival deciding any ties).""" + + chunked_prefill_enabled: bool = field(init=False) + """True if chunked prefill is enabled.""" + + disable_chunked_mm_input: bool = False + """If set to true and chunked prefill is enabled, we do not want to + partially schedule a multimodal item. Only used in V1 + This ensures that if a request has a mixed prompt + (like text tokens TTTT followed by image tokens IIIIIIIIII) where only + some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), + it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" + + # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) + # or "mod.custom_class". + scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" + """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the + default scheduler. Can be a class directly or the path to a class of form + "mod.custom_class".""" + + disable_hybrid_kv_cache_manager: bool = False + """If set to True, KV cache manager will allocate the same size of KV cache + for all attention layers even if there are multiple type of attention layers + like full attention and sliding window attention. + """ + + async_scheduling: bool = False + """EXPERIMENTAL: If set to True, perform async scheduling. This may help + reduce the CPU overheads, leading to better latency and throughput. However, + async scheduling is currently not supported with some features such as + structured outputs, speculative decoding, and pipeline parallelism. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.max_model_len is None: + self.max_model_len = 8192 + + if self.max_num_seqs is None: + self.max_num_seqs = 128 + + if self.max_num_batched_tokens is None: + if self.enable_chunked_prefill: + if self.num_scheduler_steps > 1: + # Multi-step Chunked-Prefill doesn't allow prompt-chunking + # for now. Have max_num_batched_tokens set to max_model_len + # so we don't reject sequences on account of a short + # max_num_batched_tokens. + self.max_num_batched_tokens = max( + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + else: + self.max_num_batched_tokens = ( + DEFAULT_MAX_NUM_BATCHED_TOKENS) + else: + # If max_model_len is too short, use + # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value + # for higher throughput. 
+ self.max_num_batched_tokens = max( + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + + if self.runner_type == "pooling": + # Choose specific value for higher throughput + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + if self.is_multimodal_model: + # The value needs to be at least the number of multimodal tokens + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + + # When using default settings, + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. + self.max_num_batched_tokens = min( + self.max_num_seqs * self.max_model_len, + self.max_num_batched_tokens) + + self.max_num_encoder_input_tokens = self.max_num_batched_tokens + self.encoder_cache_size = self.max_num_batched_tokens + + if self.enable_chunked_prefill: + logger.info( + "Chunked prefill is enabled with max_num_batched_tokens=%d.", + self.max_num_batched_tokens) + + self.chunked_prefill_enabled = self.enable_chunked_prefill + if self.max_num_partial_prefills > 1: + if self.long_prefill_token_threshold == 0: + self.long_prefill_token_threshold = int(self.max_model_len * + 0.04) + + logger.info( + "Concurrent partial prefills enabled with " + "max_num_partial_prefills=%d, max_long_partial_prefills=%d, " + "long_prefill_token_threshold=%d", + self.max_num_partial_prefills, self.max_long_partial_prefills, + self.long_prefill_token_threshold) + + # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)]. + # This avoids OOM in tight memory scenarios with small max_num_seqs, + # and prevents capture of many large graphs (>512) that would greatly + # increase startup time with limited performance benefit. + if not self.cuda_graph_sizes: + self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)] + + if self.async_scheduling: + self.scheduler_cls = ( + "vllm.v1.core.sched.async_scheduler.AsyncScheduler") + + @model_validator(mode='after') + def _verify_args(self) -> Self: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({self.max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. Please increase max_num_batched_tokens or " + "decrease max_model_len.") + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: + logger.warning( + "max_num_batched_tokens (%d) exceeds max_num_seqs " + "* max_model_len (%d). 
This may lead to unexpected behavior.", + self.max_num_batched_tokens, + self.max_num_seqs * self.max_model_len) + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + if self.max_num_partial_prefills < 1: + raise ValueError( + f"max_num_partial_prefills ({self.max_num_partial_prefills}) " + "must be greater than or equal to 1.") + elif self.max_num_partial_prefills > 1: + if not self.chunked_prefill_enabled: + raise ValueError("Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1.") + + if self.long_prefill_token_threshold > self.max_model_len: + raise ValueError( + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({self.max_model_len}).") + + if (self.max_long_partial_prefills + < 1) or (self.max_long_partial_prefills + > self.max_num_partial_prefills): + raise ValueError( + f"max_long_partial_prefills ({self.max_long_partial_prefills}) " + "must be greater than or equal to 1 and less than or equal to " + f"max_num_partial_prefills ({self.max_num_partial_prefills}).") + + return self + + @property + def is_multi_step(self) -> bool: + return self.num_scheduler_steps > 1 From 59f3b936365afd200e474ddc9d1f5aa33f05b634 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 12 Aug 2025 03:22:49 -0500 Subject: [PATCH 188/932] [DOC] update v1_guide with INTEL HW (#22679) Signed-off-by: Chendi.Xue --- docs/usage/v1_guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 12191d3490..54af970ea8 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -63,6 +63,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the |------------|-----------------------------------------------| | **NVIDIA** | 🚀 | | **AMD** | 🟢 | +| **INTEL GPU** | 🟢 | | **TPU** | 🟢 | | **CPU** | 🟢 (x86\_64/aarch64) 🟡 (MacOS) | @@ -72,6 +73,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the - [vllm-ascend](https://github.com/vllm-project/vllm-ascend) - [vllm-spyre](https://github.com/vllm-project/vllm-spyre) + - [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) - [vllm-openvino](https://github.com/vllm-project/vllm-openvino) Please check their corresponding repositories for more details. From 9f909b89963aa71b06b490a78ac9905d11879454 Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Tue, 12 Aug 2025 04:39:54 -0400 Subject: [PATCH 189/932] [New Model] Support Command-A-Vision (#22660) Signed-off-by: donglu --- docs/models/supported_models.md | 3 +- examples/offline_inference/vision_language.py | 24 + .../vision_language_multi_image.py | 37 ++ tests/models/registry.py | 1 + vllm/model_executor/models/cohere2_vision.py | 445 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 6 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/cohere2_vision.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ddab7ad5d9..ea36331542 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -331,7 +331,7 @@ th { | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. 
| | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ | @@ -601,6 +601,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | | `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | +| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. 
| | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 5b3f0d2dc2..988ad35cdd 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ) +def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "CohereLabs/command-a-vision-07-2025" + + engine_args = EngineArgs( + model=model_name, + max_model_len=32768, + tensor_parallel_size=4, + limit_mm_per_prompt={modality: 1}, + ) + + prompts = [ + f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Deepseek-VL2 def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1417,6 +1440,7 @@ model_example_map = { "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, + "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "florence2": run_florence2, "fuyu": run_fuyu, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 1ab405fa14..799337ed68 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "CohereLabs/command-a-vision-07-2025" + + # NOTE: This model is 122B parameters and requires tensor parallelism + # Recommended to use tp=4 on H100 GPUs + engine_args = EngineArgs( + model=model_name, + max_model_len=32768, + tensor_parallel_size=4, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" @@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, "aya_vision": load_aya_vision, + "command_a_vision": load_command_a_vision, "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, diff --git a/tests/models/registry.py b/tests/models/registry.py index c5816df25b..eae5829030 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -383,6 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501 extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 + "Cohere2VisionForConditionalGeneration": 
_HfExamplesInfo("CohereLabs/command-a-vision-07-2025"), # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py new file mode 100644 index 0000000000..f17583768f --- /dev/null +++ b/vllm/model_executor/models/cohere2_vision.py @@ -0,0 +1,445 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from vllm/model_executor/models/aya_vision.py +"""Command-A-Vision (Cohere2Vision) multimodal model implementation for vLLM.""" + +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Literal, Optional, Union + +import torch +from torch import nn +from transformers import BatchFeature, PretrainedConfig +from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.processing_cohere2_vision import ( + Cohere2VisionProcessor) + +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import MulAndSilu +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, + MultiModalFieldConfig, + PromptReplacement, PromptUpdate, + PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .siglip import SiglipVisionModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + + +class Cohere2VisionImagePixelInputs(TensorSchema): + """ + Dimensions: + - np: The total number of patches over each image over each prompt in + the batch + - c: Number of channels + - h: Height of each image patch + - w: Width of each image patch + - bn: Batch size * number of images + """ + + type: Literal["pixel_values"] + + pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", 3, "h", "w"), + ] + + num_patches: Annotated[ + torch.Tensor, + TensorShape("bn"), + ] + + +class Cohere2VisionMultiModalProjector(nn.Module): + """Multimodal projector that maps vision features to text embedding space. + + Uses pixel shuffle downsampling followed by SwiGLU activation. + """ + + def __init__(self, config: Cohere2VisionConfig, prefix: str = ""): + super().__init__() + self.downsample_factor = config.downsample_factor + + # Input dimension after pixel shuffle downsampling + input_dim = config.vision_config.hidden_size * ( + config.downsample_factor**2) + # MergedColumnParallelLinear expects the intermediate size to be a list + # of sizes, so that it will load the weights as two separate linear + # layers before applying any parallelism. 
+ # We need to divide the alignment intermediate size by 2 because + # the weights are merged weights of two linear layers for SwiGLU. + self.intermediate_size = config.alignment_intermediate_size // 2 + + self.linear_1 = MergedColumnParallelLinear( + input_dim, + [self.intermediate_size] * 2, + bias=True, + return_bias=False, + prefix=f"{prefix}.linear_1", + ) + self.act = MulAndSilu() + self.linear_2 = RowParallelLinear( + self.intermediate_size, + config.text_config.hidden_size, + bias=True, + return_bias=False, + prefix=f"{prefix}.linear_2", + ) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features: torch.Tensor) -> torch.Tensor: + """Apply pixel shuffle downsampling to reduce spatial dimensions. + + Args: + image_features: Input tensor of shape [B, S, D] where S = H*W + + Returns: + Downsampled tensor with increased channel dimension + """ + height = width = int(image_features.shape[1]**0.5) + x = image_features.reshape(image_features.shape[0], width, height, -1) + n, h, w, c = x.size() + scale_factor = 1. / self.downsample_factor + nh = int(h * scale_factor) + nw = int(w * scale_factor) + x = x.reshape(n, nh, self.downsample_factor, nw, + self.downsample_factor, c) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous() + x = x.reshape(n, nh, nw, -1) + return x + + +class Cohere2VisionProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> Cohere2VisionConfig: + return self.ctx.get_hf_config(Cohere2VisionConfig) + + def get_hf_processor(self, **kwargs: object) -> Cohere2VisionProcessor: + return self.ctx.get_hf_processor(Cohere2VisionProcessor, **kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + height = image_processor.size['height'] + width = image_processor.size['width'] + max_patches = image_processor.max_patches + return ImageSize(height=height * max_patches, width=width) + + def get_num_patches(self, image_width: int, image_height: int) -> int: + """ + Calculate the number of image patches for a given image. + Uses the HF processor to determine the actual number of patches. 
+ """ + return self.get_hf_processor( + ).image_processor.get_number_of_image_patches(image_height, + image_width, {}) + + +class Cohere2VisionDummyInputsBuilder( + BaseDummyInputsBuilder[Cohere2VisionProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + image_size = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=image_size.width, + height=image_size.height, + num_images=num_images) + } + + +class Cohere2VisionMultiModalProcessor( + BaseMultiModalProcessor[Cohere2VisionProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt, + mm_data, + mm_kwargs, + tok_kwargs, + ) + + # Ensure num_patches is available for proper tensor splitting + if "num_patches" not in processed_outputs and ( + images := mm_data.get("images")) is not None: + # Fallback calculation if HF processor didn't provide num_patches + parsed_images = self._get_data_parser().parse_mm_data({ + "image": + images + }).get_items("image", ImageProcessorItems) + + num_patches = [ + self.info.get_num_patches( + image_width=parsed_images.get_image_size(i).width, + image_height=parsed_images.get_image_size(i).height) + for i in range(len(parsed_images)) + ] + processed_outputs["num_patches"] = torch.tensor(num_patches) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + num_patches = hf_inputs.get("num_patches", torch.empty(0)) + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), + num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_token = hf_processor.image_token + img_line_break_token = hf_processor.img_line_break_token + boi_token = hf_processor.boi_token + eoi_token = hf_processor.eoi_token + + def get_replacement(item_idx: int): + images: ImageProcessorItems = mm_items.get("image", + ImageProcessorItems) + image_size: ImageSize = images.get_image_size(item_idx) + + num_patches = self.info.get_num_patches(image_size.height, + image_size.width) + img_tokens_per_tile = int(hf_processor.patch_size**2) + single_tile_tokens = image_token * img_tokens_per_tile + \ + img_line_break_token + img_string = f"{boi_token}\ + {single_tile_tokens * num_patches}\ + {eoi_token}" + + return PromptUpdateDetails.select_text(img_string, image_token) + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + Cohere2VisionMultiModalProcessor, + info=Cohere2VisionProcessingInfo, + dummy_inputs=Cohere2VisionDummyInputsBuilder) +class 
Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "model.language_model.": "language_model.model.", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: Cohere2VisionConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config + self.quant_config = quant_config + self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) + + self.vision_tower = SiglipVisionModel(config.vision_config, + quant_config, + prefix=maybe_prefix( + prefix, "vision_tower")) + self.vocab_size = config.text_config.vocab_size + self.multi_modal_projector = \ + Cohere2VisionMultiModalProjector( + config, prefix=maybe_prefix(prefix, "multi_modal_projector")) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Cohere2ForCausalLM"]) + + @property + def dtype(self): + return next(self.parameters()).dtype + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def _process_image_input(self, image_input: Cohere2VisionImagePixelInputs, + **kwargs) -> list[torch.Tensor]: + """Process image pixels through vision tower and projector. + + Args: + image_input: Validated image input containing pixel values and + patch counts + + Returns: + List of flattened image embeddings, one per image + """ + assert self.vision_tower is not None, "Vision tower is required" + + pixel_values = image_input["pixel_values"] + num_patches = image_input["num_patches"] + + # Extract visual features + image_features = self.vision_tower(pixel_values) + + # Project to text embedding space + image_embeds = self.multi_modal_projector(image_features) + + # Split and flatten embeddings per image + return [ + e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist()) + ] + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Cohere2VisionImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + num_patches = kwargs.pop("num_patches", None) + image_embeds = kwargs.pop("image_embeds", None) + assert image_embeds is None, \ + "Cohere2Vision does not support image_embeds." 
+ + if pixel_values is None: + return None + + return Cohere2VisionImagePixelInputs( + type="pixel_values", + pixel_values=flatten_bn(pixel_values, concat=True), + num_patches=flatten_bn(num_patches, concat=True), + resolve_bindings={ + "h": self.config.vision_config.image_size, + "w": self.config.vision_config.image_size, + }) + + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing `modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, "quantization_config", + None) + if (not quant_config.modules_to_not_convert) and (llm_quant_config + is not None): + quant_config.modules_to_not_convert.append("vision_tower") + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return [] + + return self._process_image_input(image_input, **kwargs) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_id, + ) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 870704c64d..279e045a70 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -201,6 +201,7 @@ _MULTIMODAL_MODELS = { "AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501 "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 + "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 From 8d17fa633e319c4a585f6ae1258000a40750e127 Mon Sep 17 00:00:00 2001 From: Sugar-zsg <64777228+Sugar-zsg@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:01:08 +0800 Subject: [PATCH 190/932] [V0] Correct CUDA Graph capture for encoder-decoder models (#22630) --- vllm/config/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 159106003f..df4eb33f5d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1164,8 +1164,18 @@ class ModelConfig: "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: + # The `max_seq_len_to_capture` was incorrectly + # based on the encoder's input length (448) + # but not the decoder's larger input length (1500). + # This change ensures the CUDA Graph captures the correct, + # larger sequence length, allowing it to work as intended. 
+ effective_max_seq_len = self.max_model_len + if self.is_encoder_decoder: + effective_max_seq_len = max( + effective_max_seq_len, + getattr(self.hf_config, "max_source_positions", 0)) self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, - self.max_model_len) + effective_max_seq_len) # CUDAGraph capture not supported for enc-dec models and mllama on ROCm ROCM_UNSUPPORTED_MODELS = ['mllama'] unsupported_rocm = (self.hf_config.model_type From bc8372efc318d404db4b40a6ef86c3452f5f2a46 Mon Sep 17 00:00:00 2001 From: phantomlei Date: Tue, 12 Aug 2025 17:03:22 +0800 Subject: [PATCH 191/932] [Bugfix] Fix erroneous randomly generated cases in bad word testing (#22170) Signed-off-by: phantomlei --- tests/v1/sample/test_sampler.py | 34 +++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index ea10661ea1..31c6c881d7 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -90,6 +90,27 @@ def _create_bad_words_token_ids( return bad_words_token_ids +# Returns all last tokens of bad word sequences that share the same prefix +# as `given_prefix` (excluding the last token). +def _collect_suffixes_with_same_prefix( + given_prefix: list[int], + bad_words_token_ids: list[list[int]]) -> list[int]: + return [bwt[-1] for bwt in bad_words_token_ids if bwt[:-1] == given_prefix] + + +# generate a valid token id that is not in bad_words_token_ids +def _generate_valid_token_id(bad_words_token_ids: list[list[int]], + vocab_size: int) -> int: + forbidden_start_tokens = set() + for bad_word in bad_words_token_ids: + forbidden_start_tokens.add(bad_word[0]) + # Get a safe token that's not in forbidden starts + safe_token_candidates = list( + set(range(vocab_size)) - forbidden_start_tokens) + # Pick a random safe token + return np.random.choice(safe_token_candidates) + + def _update_output_token_ids_for_bad_words( metadata: SamplingMetadata, vocab_size: int) -> dict[int, list[int]]: bad_words_last_tokens = {} @@ -104,12 +125,17 @@ def _update_output_token_ids_for_bad_words( prefix_length = len(bad_word_token_ids) - 1 has_bad_words = np.random.choice([True, False]) if has_bad_words: - output_token_ids[-prefix_length:] = bad_word_token_ids[:-1] - bad_words_last_token.append(bad_word_token_ids[-1]) + prefix = bad_word_token_ids[:-1] + output_token_ids[-prefix_length:] = prefix + # Collect all last tokens from other bad words + # that share this prefix + bad_words_last_token.extend( + _collect_suffixes_with_same_prefix( + prefix, bad_words_token_ids)) break # Maximum one update to output_token_ids else: # Make sure no accidental match to bad words - output_token_ids[-1] = (bad_word_token_ids[-2] + - 1) % vocab_size + output_token_ids[-1] = _generate_valid_token_id( + bad_words_token_ids, vocab_size) bad_words_last_tokens[batch_idx] = bad_words_last_token return bad_words_last_tokens From 1ece7f30baa9d94ff57e13d851725acf657a9690 Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:03:53 +0800 Subject: [PATCH 192/932] Fix: AWQ Marlin get_quant_method does not recognize "modules_to_not_convert" (#21888) Signed-off-by: JunHowie Co-authored-by: JunHowie Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/awq_marlin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 0fdded0b5a..6cf02658a9 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -10,7 +10,8 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) @@ -141,6 +142,9 @@ class AWQMarlinConfig(QuantizationConfig): elif isinstance(layer, FusedMoE): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) + if is_layer_skipped_awq( + prefix, getattr(self, "modules_to_not_convert", [])): + return UnquantizedFusedMoEMethod(layer.moe_config) if not check_moe_marlin_supports_layer(layer, self.group_size): logger.warning_once( f"Layer '{prefix}' is not supported by AWQMoeMarlin. " @@ -520,4 +524,4 @@ class AWQMoEMethod(FusedMoEMethodBase): expert_map=expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, - workspace=layer.workspace) + workspace=layer.workspace) \ No newline at end of file From 46ae7f666699496f45c0349b87f08d5119720951 Mon Sep 17 00:00:00 2001 From: RishiAstra <40644327+RishiAstra@users.noreply.github.com> Date: Tue, 12 Aug 2025 05:04:37 -0400 Subject: [PATCH 193/932] [Bugfix] Mamba2 SSD varlen bug fix initstates decay, improve test, assert chunk pwr 2 (#21783) Signed-off-by: Rishi Astra <40644327+RishiAstra@users.noreply.github.com> --- tests/kernels/mamba/test_mamba_ssm_ssd.py | 17 ++++++++--------- .../layers/mamba/ops/ssd_chunk_scan.py | 6 ++---- .../layers/mamba/ops/ssd_combined.py | 5 +++++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 67b14a7faa..d2b893ffff 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -187,7 +187,7 @@ def generate_continuous_batched_examples(example_lens_by_batch, [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) @pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) -@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)]) +@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)]) def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): @@ -253,15 +253,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, (8, 8, 16, 32, 16), ]), # mode examples with varied lengths - # odd chunk_size - (64, 29, 2, [(11, 4), (13, 23), (19, 22), - (21, 15)]), # irregular sizes - # large-ish chunk_size (256) (64, 256, 1, [(5, ), (1, ), (1, ), (1, )]), # irregular sizes with small sequences (64, 256, 2, [(5, 30), (1, 2), (1, 2), (1, 2)]), # irregular sizes with small sequences + + # we also need to test some large seqlen + # to catch errors with init states decay + (768, 128, 2, [(138, 225), (138, 225)]), ]) def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, itype): @@ -271,10 +271,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases - # 
TODO: the irregular chunk size cases have some issues and require higher - # tolerance. This is to be invesigated - if chunk_size not in {8, 256}: - atol, rtol = 5e-1, 5e-1 + # This test can have larger error for longer sequences + if seqlen > 256: + atol, rtol = 1e-2, 5e-3 else: atol, rtol = 5e-3, 5e-3 diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index fc2b3b25fd..365139e237 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -290,10 +290,8 @@ def _chunk_scan_fwd_kernel( # get the cs at the offset boundary # - c_off == 0 is a passthrough dA_cs_m_boundary = tl.load( - dA_cumsum_ptr + - (pid_m * BLOCK_SIZE_M + c_off - 1) * stride_dA_cs_csize, - mask=(((pid_m * BLOCK_SIZE_M + c_off - 1) > -1) - and ((pid_m * BLOCK_SIZE_M + c_off) < chunk_size)), + dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize, + mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)), other=0.0).to(tl.float32) if HAS_SEQ_IDX: diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index ad2853a3d8..fd74cb8372 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -21,6 +21,10 @@ from .ssd_state_passing import _state_passing_fwd TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0') +def is_int_pow_2(n): + return isinstance(n, int) and n > 0 and (n & (n - 1)) == 0 + + def _mamba_chunk_scan_combined_fwd(x, dt, A, @@ -38,6 +42,7 @@ def _mamba_chunk_scan_combined_fwd(x, dt_softplus=False, dt_limit=(0.0, float("inf")), out=None): + assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2" batch, seqlen, nheads, headdim = x.shape _, _, ngroups, dstate = B.shape assert nheads % ngroups == 0 From 50f2aae1b4afb8799bc6a38254639e031997e61c Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Tue, 12 Aug 2025 17:05:14 +0800 Subject: [PATCH 194/932] [LMCache][Example] Align the PYTHONHASHSEED for prefillers and decoders for KV chunks hashing (#21161) Signed-off-by: zejunchen-zejun --- .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh index 1284466a45..682df45d95 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -15,6 +15,14 @@ else MODEL=$2 fi +# The prefillers and decoders in LMCache use the same hash seed for all chunk keys. +# This seed must be aligned so that decoders can identify and retrieve KV cache +# entries stored by prefillers. +# +# WARNING: Using a fixed hash seed is insecure and makes the application vulnerable to +# denial-of-service attacks. In a production environment, this should be set to a +# secure random value. This is set to a fixed value for demonstration purposes only. 
+export PYTHONHASHSEED=${VLLM_PYTHON_HASH_SEED:-123} if [[ $1 == "prefiller" ]]; then # Prefiller listens on port 8100 From b8a9d0e4298710c5b3533b411395593dcaaa61c2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 12 Aug 2025 18:15:33 +0800 Subject: [PATCH 195/932] [Misc] remove GH discussions link (#22722) Signed-off-by: Jee Jee Li --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d9e3ca660f..fd8b02ac1f 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature From 007dd90859cc0337510536677418a43d8f66e286 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Tue, 12 Aug 2025 06:21:44 -0400 Subject: [PATCH 196/932] [gpt-oss] Enable gpt-oss on ampere (#22714) Signed-off-by: Yongye Zhu --- .../vllm_add_dummy_platform/dummy_platform.py | 5 +++-- vllm/attention/layer.py | 4 +++- vllm/attention/selector.py | 5 ++++- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- vllm/platforms/cpu.py | 4 ++-- vllm/platforms/cuda.py | 7 +++++-- vllm/platforms/interface.py | 4 ++-- vllm/platforms/rocm.py | 4 ++-- vllm/platforms/tpu.py | 4 ++-- vllm/platforms/xpu.py | 4 ++-- 10 files changed, 26 insertions(+), 17 deletions(-) diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index e67825f89d..8d0687b49b 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -25,5 +25,6 @@ class DummyPlatform(Platform): compilation_config.custom_ops = ["all"] def get_attn_backend_cls(self, backend_name, head_size, dtype, - kv_cache_dtype, block_size, use_v1, use_mla): - return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 \ No newline at end of file + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink): + return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b4c3cbd7c9..1a9c0e26b5 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -138,6 +138,7 @@ class Attention(nn.Module): self.head_size = head_size self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window + self.has_sink = extra_impl_args.get("sinks") is not None quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None @@ -165,7 +166,8 @@ class Attention(nn.Module): kv_cache_dtype, block_size, is_attention_free, - use_mla=use_mla) + use_mla=use_mla, + has_sink=self.has_sink) else: self.attn_backend = attn_backend diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 508470bb36..3a235ba6e0 100644 --- a/vllm/attention/selector.py 
+++ b/vllm/attention/selector.py @@ -144,6 +144,7 @@ def get_attn_backend( block_size: int, is_attention_free: bool = False, use_mla: bool = False, + has_sink: bool = False, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" # Accessing envs.* behind an @lru_cache decorator can cause the wrong @@ -158,6 +159,7 @@ def get_attn_backend( is_attention_free=is_attention_free, use_v1=envs.VLLM_USE_V1, use_mla=use_mla, + has_sink=has_sink, ) @@ -170,6 +172,7 @@ def _cached_get_attn_backend( is_attention_free: bool, use_v1: bool = False, use_mla: bool = False, + has_sink: bool = False, ) -> type[AttentionBackend]: # If there are no attention layers (e.g. we are running Mamba), # use the placeholder NO_ATTENTION @@ -201,7 +204,7 @@ def _cached_get_attn_backend( # get device-specific attn_backend attention_cls = current_platform.get_attn_backend_cls( selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, - use_mla) + use_mla, has_sink) if not attention_cls: raise ValueError( f"Invalid attention backend for {current_platform.device_name}") diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 4e59aef480..03fbcf1583 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -42,7 +42,7 @@ class Mxfp4Config(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 90 + return 80 @classmethod def get_name(cls) -> QuantizationMethods: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 31a67183ff..0b16a8e1d1 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -91,8 +91,8 @@ class CpuPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: if selected_backend and selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index dd9356e399..c876c52a2e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -222,8 +222,8 @@ class CudaPlatformBase(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, - kv_cache_dtype, block_size, use_v1, - use_mla) -> str: + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink) -> str: if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here @@ -321,6 +321,9 @@ class CudaPlatformBase(Platform): # FlashAttention is the default for SM 8.0+ GPUs if cls.has_device_capability(80): + if has_sink: + logger.info_once("Using Triton backend on V1 engine.") + return TRITON_ATTN_VLLM_V1 if is_default_backend_supported := is_attn_backend_supported( FLASH_ATTN_V1, head_size, dtype, allow_import_error=False): diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index a85b583abc..91d5314900 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -196,8 +196,8 @@ class Platform: @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: """Get the attention backend class 
of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d26e4b3350..8005830f55 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -188,8 +188,8 @@ class RocmPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, - kv_cache_dtype, block_size, use_v1, - use_mla) -> str: + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink) -> str: if use_mla: from vllm.attention.backends.rocm_aiter_mla import ( is_aiter_mla_enabled) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 146801c9d7..c56096d936 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -46,8 +46,8 @@ class TpuPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink) -> str: if (selected_backend != _Backend.PALLAS and selected_backend != _Backend.PALLAS_VLLM_V1): logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d8a663f2f0..abd58dbbcb 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -35,8 +35,8 @@ class XPUPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: if selected_backend is not None and selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) use_v1 = envs.VLLM_USE_V1 From 767e63b860dcb8952779f6035d2b215b53dd744d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:25:55 +0100 Subject: [PATCH 197/932] [Docs] Improve docs navigation (#22720) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .gitignore | 3 ++- docs/.nav.yml | 22 +++++++--------------- docs/README.md | 11 +++++++++++ docs/examples/README.md | 7 +++++++ docs/mkdocs/stylesheets/extra.css | 7 +++++++ docs/usage/README.md | 4 +++- mkdocs.yaml | 5 +++-- 7 files changed, 40 insertions(+), 19 deletions(-) create mode 100644 docs/examples/README.md diff --git a/.gitignore b/.gitignore index 5dc0f04b6f..721dd7536b 100644 --- a/.gitignore +++ b/.gitignore @@ -150,7 +150,8 @@ venv.bak/ # mkdocs documentation /site docs/argparse -docs/examples +docs/examples/* +!docs/examples/README.md # mypy .mypy_cache/ diff --git a/docs/.nav.yml b/docs/.nav.yml index acedc32c30..dbac0e12f1 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -1,25 +1,17 @@ nav: - - Home: - - vLLM: README.md + - Home: README.md + - User Guide: + - usage/README.md - Getting Started: - getting_started/quickstart.md - getting_started/installation - Examples: + - examples/README.md - Offline Inference: examples/offline_inference - Online Serving: examples/online_serving - Others: examples/others - - Quick Links: - - User Guide: usage/README.md - - Developer Guide: contributing/README.md - - API Reference: api/README.md - - CLI Reference: cli/README.md - - Timeline: - - Roadmap: https://roadmap.vllm.ai - - Releases: https://github.com/vllm-project/vllm/releases - - User Guide: - - Summary: usage/README.md - - usage/v1_guide.md - General: + - usage/v1_guide.md - usage/* - Inference and Serving: - serving/offline_inference.md 
@@ -32,7 +24,7 @@ nav: - deployment/integrations - Training: training - Configuration: - - Summary: configuration/README.md + - configuration/README.md - configuration/* - Models: - models/supported_models.md @@ -45,7 +37,7 @@ nav: - features/* - features/quantization - Developer Guide: - - Summary: contributing/README.md + - contributing/README.md - General: - glob: contributing/* flatten_single_child_sections: true diff --git a/docs/README.md b/docs/README.md index 6823008ed3..e8d2fd953a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,6 +21,17 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. +Where to get started with vLLM depends on the type of user. If you are looking to: + +- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) +- Build applications with vLLM, we recommend starting with the [User Guide](./usage) +- Build vLLM, we recommend starting with [Developer Guide](./contributing) + +For information about the development of vLLM, see: + +- [Roadmap](https://roadmap.vllm.ai) +- [Releases](https://github.com/vllm-project/vllm/releases) + vLLM is fast with: - State-of-the-art serving throughput diff --git a/docs/examples/README.md b/docs/examples/README.md new file mode 100644 index 0000000000..34e4dfd408 --- /dev/null +++ b/docs/examples/README.md @@ -0,0 +1,7 @@ +# Examples + +vLLM's examples are split into three categories: + +- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/) +- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/) +- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/) diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index fb44d9cdcf..6a1979b241 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -23,6 +23,13 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) { } } +a[href*="localhost"]::after, +a[href*="127.0.0.1"]::after, +a[href*="org.readthedocs.build"]::after, +a[href*="docs.vllm.ai"]::after { + display: none !important; +} + /* Light mode: darker section titles */ body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis { color: rgba(0, 0, 0, 0.7) !important; diff --git a/docs/usage/README.md b/docs/usage/README.md index 681db57d8e..83aea12181 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,8 @@ # Using vLLM -vLLM supports the following usage patterns: +First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment. + +Then, vLLM supports the following usage patterns: - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. - [Deployment](../deployment/docker.md): Scale up model instances for production. 
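The examples split documented above distinguishes offline inference (the Python API) from online serving (HTTP clients). As a rough illustration of the online-serving path only, the sketch below queries a locally running `vllm serve` instance through its OpenAI-compatible API; the host, port, API key, and model name are illustrative assumptions, not values taken from these patches.

```python
# Hedged sketch: talking to a local `vllm serve` endpoint with the OpenAI
# client. Assumes something like `vllm serve <model> --port 8000` is already
# running; every connection detail below is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen3-8B",  # placeholder model name
    messages=[{"role": "user", "content": "Summarize what vLLM does."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```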
diff --git a/mkdocs.yaml b/mkdocs.yaml index 3a64888fb4..47fe1ebce9 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -34,13 +34,14 @@ theme: - content.action.edit - content.code.copy - content.tabs.link + - navigation.instant + - navigation.instant.progress - navigation.tracking - navigation.tabs - navigation.tabs.sticky - navigation.sections - - navigation.prune - - navigation.top - navigation.indexes + - navigation.top - search.highlight - search.share - toc.follow From d030b01548d52a5e3afe56fdb8ce7a367b9799e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 14:37:30 +0200 Subject: [PATCH 198/932] [BugFix][Nixl][PD] Fix heterogenous TP (#22663) Signed-off-by: NickLucche Co-authored-by: Nick Hill --- .../kv_transfer/kv_connector/factory.py | 37 ++++++++++++------- .../kv_transfer/kv_connector/utils.py | 11 ++++-- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 01673a0d7c..584fc1d655 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -4,13 +4,17 @@ import importlib from typing import TYPE_CHECKING, Callable +# yapf: disable import vllm.envs as envs -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.base import ( + KVConnectorBase, KVConnectorBaseType) from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger +# yapf: enable + if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import KVTransferConfig, VllmConfig logger = init_logger(__name__) @@ -42,17 +46,7 @@ class KVConnectorFactory: f"but found {envs.VLLM_USE_V1=}") kv_transfer_config = config.kv_transfer_config - connector_name = kv_transfer_config.kv_connector - if connector_name in cls._registry: - connector_cls = cls._registry[connector_name]() - else: - connector_module_path = kv_transfer_config.kv_connector_module_path - if connector_module_path is None: - raise ValueError( - f"Unsupported connector type: {connector_name}") - connector_module = importlib.import_module(connector_module_path) - connector_cls = getattr(connector_module, connector_name) - assert issubclass(connector_cls, KVConnectorBase) + connector_cls = cls.get_connector_class(kv_transfer_config) logger.info("Creating v1 connector with name: %s and engine_id: %s", connector_cls.__name__, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. @@ -65,6 +59,23 @@ class KVConnectorFactory: # We build separately to enforce strict separation return connector_cls(config, role) + @classmethod + def get_connector_class( + cls, kv_transfer_config: "KVTransferConfig" + ) -> type[KVConnectorBaseType]: + """Get the connector class by name.""" + connector_name = kv_transfer_config.kv_connector + if connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = kv_transfer_config.kv_connector_module_path + if connector_module_path is None: + raise ValueError( + f"Unsupported connector type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + connector_cls = getattr(connector_module, connector_name) + return connector_cls + # Register various connectors here. 
# The registration should not be done in each individual file, as we want to diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 1da41790f9..2364400b3d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -13,8 +13,8 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import VllmConfig, get_current_vllm_config -from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1) +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) from vllm.logger import init_logger from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput @@ -106,8 +106,9 @@ def get_kv_connector_cache_layout(): vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config if kv_config is not None: - required_kvcache_layout = ( - KVConnectorBase_V1.get_required_kvcache_layout(vllm_config)) + connector_cls = KVConnectorFactory.get_connector_class(kv_config) + required_kvcache_layout = connector_cls.get_required_kvcache_layout( + vllm_config) if required_kvcache_layout is not None: return required_kvcache_layout logger.info_once("Connectors do not specify a " \ @@ -143,6 +144,8 @@ class KVOutputAggregator: finished_recving = set[str]() for output in outputs: output = output.kv_connector_output + if not output: + continue update_finished_set(output.finished_sending, self._send_remaining_count, finished_sending) update_finished_set(output.finished_recving, From 80bb1e8afe950342e93b7262e7bf25eb6d29b287 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 13:38:48 +0100 Subject: [PATCH 199/932] Officially support SmolLM3 using the Transformers backend (#22665) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 6 ++++++ tests/models/registry.py | 1 + vllm/model_executor/models/registry.py | 3 +++ 3 files changed, 10 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ea36331542..a24fa4bcce 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -409,6 +409,12 @@ th { | `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | +Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! + +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | +|--------------|--------|-------------------|----------------------|---------------------------|---------------------| +| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ | + !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. 
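For the Transformers-backend-only models documented in the table above, nothing changes on the user side; a minimal offline-inference sketch using the example SmolLM3 checkpoint from that table (the prompt and sampling settings are illustrative assumptions):

```python
# Hedged sketch: offline generation with a model served via the Transformers
# backend. vLLM selects that backend for SmolLM3 automatically, so no extra
# flags are needed here; prompt and sampling values are placeholders only.
from vllm import LLM, SamplingParams

llm = LLM(model="HuggingFaceTB/SmolLM3-3B")
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```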
diff --git a/tests/models/registry.py b/tests/models/registry.py index eae5829030..d7d20d1f3a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -291,6 +291,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), + "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501 "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 279e045a70..64dbde4916 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -271,6 +271,9 @@ _SPECULATIVE_DECODING_MODELS = { } _TRANSFORMERS_SUPPORTED_MODELS = { + # Text generation models + "SmolLM3ForCausalLM": ("transformers", "TransformersForCausalLM"), + # Multimodal models "Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 } From f7ad6a1eb3deb9ca70a6bce3705dbd16cf9d8b28 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 20:42:58 +0800 Subject: [PATCH 200/932] [CI Failure] fix tests/entrypoints/openai/test_skip_tokenizer.py (#22708) Signed-off-by: wang.yuqi --- .../model_executor/models/prithvi_geospatial_mae.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 304a9e987e..20f423cc76 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -25,11 +25,11 @@ import torch.nn as nn from transformers import BatchFeature from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import (AllPool, PoolerHead, - PoolerIdentity, SimplePooler) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( - IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput) + IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput, + default_pooling_type) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -142,6 +142,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): ) +@default_pooling_type("All") @MULTIMODAL_REGISTRY.register_processor( PrithviGeoSpatialMAEMultiModalProcessor, info=PrithviGeoSpatialMAEProcessingInfo, @@ -198,7 +199,11 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, "Only SemanticSegmentationTask is supported for now " "by PrithviGeospatialMAE.") - self.pooler = SimplePooler(AllPool(), PoolerHead(PoolerIdentity())) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) def _parse_and_validate_multimodal_data( self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: From 67c153b88a2129c3b6fb78af09901738f1034a68 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Tue, 
12 Aug 2025 20:50:59 +0800 Subject: [PATCH 201/932] Fix Llama4 FlashInfer FP4 MoE issues (#22511) Signed-off-by: Po-Han Huang --- .../layers/fused_moe/flashinfer_cutlass_moe.py | 2 -- .../fused_moe/flashinfer_cutlass_prepare_finalize.py | 7 ++++++- vllm/model_executor/layers/quantization/modelopt.py | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 3e79a1a8c2..4e3e15a35a 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -170,8 +170,6 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): "w1_scale and w2_scale must not " "be None for FlashInferExperts") - assert not apply_router_weight_on_input - quant_scales = [ a1_gscale, w1_scale.view(torch.int32), diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 7fdb465c45..36aca8cf74 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -60,7 +60,12 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - assert not apply_router_weight_on_input + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) (a1_gscale, use_dp, local_tokens) = extract_required_args( extra_prepare_args, ['a1_gscale', 'use_dp', 'local_tokens']) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 147b275eaf..bed5022267 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1299,8 +1299,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): output2_scale_scalar=layer.g2_alphas.data, num_experts=global_num_experts, top_k=top_k, - n_group=num_expert_group, - topk_group=topk_group, + n_group=num_expert_group + if num_expert_group is not None else 0, + topk_group=topk_group if topk_group is not None else 0, intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, From 3d9d40efdeea7011dc3c496ad9d55cfdc90aff92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 16:30:17 +0200 Subject: [PATCH 202/932] [Bugfix][CI] Fix `test_remote_decode_lifecycle.py::test_short_prompt_lifecycle` (#22727) Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 1bddfef0f2..2f8228864e 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -130,8 +130,9 @@ def test_short_prompt_lifecycle(): # Confirm we do not have any memory leaks after req lifecycle. 
# We need to mark sending finish to clear data for persistent batch. scheduler_output = scheduler.schedule() - model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_sending = [request.request_id] + # Use create_model_runner_output to pass kv_connector_output along + model_runner_output = create_model_runner_output( + reqs=[request], finished_sending=[request.request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) assert_scheduler_empty(scheduler) From e5d3d63c42aa85025dfb1b5dec369c0c856a4efa Mon Sep 17 00:00:00 2001 From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:41:37 +0300 Subject: [PATCH 203/932] [Benchmark] Fix terminal colors in benchmark_serving_multi_turn (python 3.12) (#22730) Signed-off-by: daniels --- benchmarks/multi_turn/bench_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py index d4d3c1ca8c..e959a4be71 100644 --- a/benchmarks/multi_turn/bench_utils.py +++ b/benchmarks/multi_turn/bench_utils.py @@ -4,7 +4,7 @@ import logging from enum import Enum -class Color(str, Enum): +class Color(Enum): RED = "\033[91m" GREEN = "\033[92m" BLUE = "\033[94m" @@ -13,6 +13,9 @@ class Color(str, Enum): YELLOW = "\033[93m" RESET = "\033[0m" + def __str__(self): + return self.value + TEXT_SEPARATOR = "-" * 100 From 5a4b4b3729e1a1594bf56d38b7c8d3f556754634 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Tue, 12 Aug 2025 21:54:52 +0530 Subject: [PATCH 204/932] Add: `SupportsEagle3` interface for explicit EAGLE3 support (#22642) Signed-off-by: Rahul Tuli --- .../speculators/test_eagle3.py | 18 ++++++- vllm/model_executor/models/interfaces.py | 53 +++++++++++++++++++ vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/qwen3.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 10 +++- 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index c46ac7a88b..45ddb21787 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -3,12 +3,20 @@ import pytest import torch +from vllm.model_executor.models.interfaces import supports_eagle3 + @pytest.mark.parametrize( "model_path", [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) -def test_llama(vllm_runner, example_prompts, model_path): +def test_llama(vllm_runner, example_prompts, model_path, monkeypatch): + # Set environment variable for V1 engine serialization + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + eagle3_supported = vllm_model.apply_model(supports_eagle3) + assert eagle3_supported + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) @@ -18,8 +26,14 @@ def test_llama(vllm_runner, example_prompts, model_path): @pytest.mark.parametrize( "model_path", [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) -def test_qwen(vllm_runner, example_prompts, model_path): +def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch): + # Set environment variable for V1 engine serialization + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + eagle3_supported = 
vllm_model.apply_model(supports_eagle3) + assert eagle3_supported + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 46caf3fce4..c425488f83 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -823,3 +823,56 @@ def supports_v0_only( model: Union[type[object], object], ) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]: return getattr(model, "supports_v0_only", False) + + +@runtime_checkable +class SupportsEagle3(Protocol): + """The interface required for models that support + EAGLE3 speculative decoding.""" + + supports_eagle3: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports EAGLE3 + speculative decoding. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + """ + Set which layers should output auxiliary + hidden states for EAGLE3. + + Args: + layers: Tuple of layer indices that should output auxiliary + hidden states. + """ + ... + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + """ + Get the layer indices that should output auxiliary hidden states + for EAGLE3. + + Returns: + Tuple of layer indices for auxiliary hidden state outputs. + """ + ... + + +@overload +def supports_eagle3(model: type[object]) -> TypeIs[type[SupportsEagle3]]: + ... + + +@overload +def supports_eagle3(model: object) -> TypeIs[SupportsEagle3]: + ... + + +def supports_eagle3( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]: + return isinstance(model, SupportsEagle3) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index bc511d8339..24cd448d83 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -49,7 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -463,7 +463,7 @@ class LlamaModel(nn.Module): return loaded_params -class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"] diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 0ad50640bb..2060206633 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -261,7 +261,7 @@ class Qwen3Model(Qwen2Model): 
decoder_layer_type=Qwen3DecoderLayer) -class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ed4d6bcb09..2e1cc37b1b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -35,6 +35,7 @@ from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import (is_mixture_of_experts, + supports_eagle3, supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) @@ -1981,8 +1982,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logger.info("Loading drafter model...") self.drafter.load_model(self.model) if self.use_aux_hidden_state_outputs: - self.model.set_aux_hidden_state_layers( - self.model.get_eagle3_aux_hidden_state_layers()) + if supports_eagle3(self.model): + self.model.set_aux_hidden_state_layers( + self.model.get_eagle3_aux_hidden_state_layers()) + else: + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested") time_after_load = time.perf_counter() self.model_memory_usage = m.consumed_memory logger.info("Model loading took %.4f GiB and %.6f seconds", From c42fe0b63a29d3ec157089c9784643000dde4aec Mon Sep 17 00:00:00 2001 From: TeeKen Lau <13831887+teekenl@users.noreply.github.com> Date: Wed, 13 Aug 2025 02:34:41 +1000 Subject: [PATCH 205/932] Add more test scenario for tensor schema (#22733) Signed-off-by: teekenl --- tests/utils_/test_tensor_schema.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/utils_/test_tensor_schema.py b/tests/utils_/test_tensor_schema.py index 69744921b1..6aa781c156 100644 --- a/tests/utils_/test_tensor_schema.py +++ b/tests/utils_/test_tensor_schema.py @@ -33,6 +33,31 @@ def test_tensor_schema_constant_dim_failure(): ) +def test_tensor_schema_invalid_types_in_list(): + with pytest.raises(ValueError, match="is not a torch.Tensor"): + Phi3VImagePixelInputs( + data=[ + torch.randn(64, 3, 32, 32), + "not_a_tensor", + torch.randn(64, 3, 32, 32), + ], + image_sizes=torch.randint(0, 256, (3, 2)), + ) + + +def test_tensor_schema_rank_mismatch(): + with pytest.raises(ValueError, match="has rank 3 but expected 5"): + Phi3VImagePixelInputs( + data=torch.randn(16, 64, 3), + image_sizes=torch.randint(0, 256, (16, 2)), + ) + + +def test_tensor_schema_missing_required_field(): + with pytest.raises(ValueError, match="Required field 'data' is missing"): + Phi3VImagePixelInputs(image_sizes=torch.randint(0, 256, (16, 2)), ) + + def test_tensor_schema_symbolic_dim_mismatch(): with pytest.raises(ValueError, match="expected 'bn'=12, got 16"): Phi3VImagePixelInputs( From dab4f9f764119117c8ea1af0a3b5bcbb1c80bf76 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:50:31 -0400 Subject: [PATCH 206/932] [Chore] Update CODEOWNERS to include @yewentao256 for CUDA kernels, attention backends, quantization, and related tests (#22741) Signed-off-by: yewentao256 --- .github/CODEOWNERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/.github/CODEOWNERS b/.github/CODEOWNERS index 0a7f8e8be4..a0a327319a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,7 +9,7 @@ /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor +/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @@ -34,16 +34,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu -/tests/quantization @mgoin @robertgshaw2-redhat +/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/weight_loading @mgoin @youkaichao +/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee # Docs From 6bd8ebf026600e9851026c8850f88c5e10acfab1 Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng Date: Tue, 12 Aug 2025 12:53:36 -0700 Subject: [PATCH 207/932] [Kernel][AMD] Avoid D2H copy and cumsum kernel (#22683) Signed-off-by: Xiaozhu Signed-off-by: Michael Goin Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/attention/backends/rocm_aiter_fa.py | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index abe0517450..e8bffbef44 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -214,12 +214,14 @@ class AiterFlashAttentionMetadata: # |-- query_len ---| num_actual_tokens: int # Number of tokens excluding padding. + num_actual_kv_tokens: int max_query_len: int query_start_loc: torch.Tensor max_seq_len: int seq_lens: torch.Tensor slot_mapping: torch.Tensor block_table: torch.Tensor + cu_seq_lens: Optional[torch.Tensor] # For cascade attention. 
use_cascade: bool @@ -272,6 +274,20 @@ class AiterFlashAttentionMetadataBuilder( seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping + if max_query_len > 1: + # We pre-compute cumulative seq len needed for prefill attention + # here to avoid recomputing it for every layer + cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, + dtype=torch.int32, + device=seq_lens.device) + torch.cumsum(seq_lens, + dim=0, + dtype=cu_seq_lens.dtype, + out=cu_seq_lens[1:]) + num_actual_kv_tokens = int(cu_seq_lens[-1].item()) + else: + cu_seq_lens = None + num_actual_kv_tokens = 0 def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): @@ -281,12 +297,14 @@ class AiterFlashAttentionMetadataBuilder( attn_metadata = AiterFlashAttentionMetadata( num_actual_tokens=num_actual_tokens, + num_actual_kv_tokens=num_actual_kv_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_lens=seq_lens, block_table=block_table_tensor, slot_mapping=slot_mapping, + cu_seq_lens=cu_seq_lens, use_cascade=use_cascade, common_prefix_len=common_prefix_len, total_tokens=self.total_tokens, @@ -475,16 +493,6 @@ class AiterFlashAttentionImpl(AttentionImpl): block_table = attn_metadata.block_table if max_seqlen_q > 1: - - cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1, - dtype=torch.int32, - device=query.device) - - torch.cumsum(seqused_k, - dim=0, - dtype=cu_seq_lens.dtype, - out=cu_seq_lens[1:]) - torch.ops.vllm.flash_attn_varlen_func( query[:num_actual_tokens], key_cache, @@ -497,10 +505,10 @@ class AiterFlashAttentionImpl(AttentionImpl): alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, - cu_seqlens_k=cu_seq_lens, + cu_seqlens_k=attn_metadata.cu_seq_lens, k_scale=layer._k_scale, v_scale=layer._v_scale, - total_tokens=attn_metadata.total_tokens, + total_tokens=attn_metadata.num_actual_kv_tokens, ) _, num_heads, head_size = query.shape From 422f22e01265b0ba6a99763e0b69f8dbba06b371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 21:53:52 +0200 Subject: [PATCH 208/932] [CI][Nixl] Check kv cache layout during handshake (#22745) Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 46 +++++++++++++++++++ .../kv_connector/v1/nixl_connector.py | 13 ++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c673983235..3860d7c857 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -419,6 +419,52 @@ class TestNixlHandshake: return raise TimeoutError("Took too long to complete async handshake.") + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper) + def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init): + """ + Verify that adding a remote agent fails if kv_cache_layout differs. + This test is only relevant for heterogeneous TP. 
+ """ + vllm_config = create_vllm_config() + + # Mock TP world size to 2 to force heterogeneous TP when + # remote_tp_size=1 + with patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size", # noqa: E501 + return_value=2): + # Initialize connector and worker (with fake NIXL wrapper) + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, connector.engine_id, hand_shake_latency=0) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_bytes = 4096 + worker.block_len = worker.slot_size_bytes * worker.block_size + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + + # Metadata with different kv_cache_layout than local worker + mismatched_layout = "HND" if worker.kv_cache_layout != "HND" \ + else "NHD" + meta = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + num_blocks=1, + block_len=worker.block_len, + attn_backend_name=worker.backend_name, + kv_cache_layout=mismatched_layout, + ) + + # We don't check layout for homogeneous TP and MLA for now, as the + # whole block is moved. + worker.add_remote_agent(meta, remote_tp_size=2) + with pytest.raises(AssertionError): + worker.add_remote_agent(meta, remote_tp_size=1) + # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a6eeb27853..4f51229ffb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -30,6 +30,7 @@ from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -73,6 +74,7 @@ class NixlAgentMetadata( num_blocks: int block_len: int attn_backend_name: str + kv_cache_layout: str @dataclass @@ -538,7 +540,9 @@ class NixlConnectorWorker: attn_backend = backend_name_to_enum(self.backend_name) self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1 self._use_pallas_v1 = attn_backend == _Backend.PALLAS_VLLM_V1 + self.kv_cache_layout = get_kv_cache_layout() logger.debug("Detected attention backend %s", self.backend_name) + logger.debug("Detected kv cache layout %s", self.kv_cache_layout) self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} # With heterogeneous TP, P must wait for all assigned D TP workers to @@ -839,7 +843,8 @@ class NixlConnectorWorker: kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], num_blocks=self.num_blocks, block_len=self.block_len, - attn_backend_name=self.backend_name) + attn_backend_name=self.backend_name, + kv_cache_layout=self.kv_cache_layout) ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, @@ -900,8 +905,7 @@ class NixlConnectorWorker: self._tp_size[engine_id] = remote_tp_size else: assert self._tp_size[engine_id] == remote_tp_size - # We may 
eventually enable this after asserting equality in cache - # layout and close outputs. + # TODO We may eventually want to skip enforcing the same attn backend. assert nixl_agent_meta.attn_backend_name == self.backend_name remote_agent_name = self.nixl_wrapper.add_remote_agent( @@ -930,6 +934,9 @@ class NixlConnectorWorker: if self._use_flashinfer: # Account for joint KV in FlashInfer. remote_block_size //= 2 + if tp_ratio > 1: + # Heterogeneous TP expects same kv_cache_layout. + assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( "Remote P worker KV layer cache must be of shape [2, N, " From 6534d2fc9773db101e0cb6d2bd9617bfd41e7876 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 12 Aug 2025 12:54:42 -0700 Subject: [PATCH 209/932] Fix torch version check for SM100 mxfp4 (#22535) Signed-off-by: Zifei Tong Signed-off-by: mgoin Co-authored-by: mgoin --- vllm/model_executor/layers/fused_moe/layer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d5a89655e3..fb38fb91ea 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -741,12 +741,14 @@ class FusedMoE(torch.nn.Module): # we padding globally so EP buffer allocation works if quant_config and quant_config.get_name() == "mxfp4": - if not is_torch_equal_or_newer("2.8.0"): - raise RuntimeError("Mxfp4 on hopper requires torch >= 2.8.0") - if current_platform.is_device_capability( - 90) and not has_triton_kernels(): - raise NotImplementedError( - "Triton kernels must be installed for mxfp4 on hopper") + if not current_platform.is_device_capability(100): + if not is_torch_equal_or_newer("2.8.0"): + raise RuntimeError( + "Mxfp4 on non-blackwell requires torch >= 2.8.0") + if not has_triton_kernels(): + raise NotImplementedError( + "triton_kernels must be installed for " + "mxfp4 on non-blackwell") if (current_platform.is_rocm() or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): From 53c730286c5ad86a6d78d4a4d8a2cd7042725d24 Mon Sep 17 00:00:00 2001 From: RUTHLESS-BOT Date: Wed, 13 Aug 2025 04:31:48 +0800 Subject: [PATCH 210/932] [Misc] parametrize 'dtype' in test_flash_mla (#22641) Signed-off-by: RUTHLESS-BOT Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_flashmla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 21b08e45fd..81841be583 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -35,11 +35,10 @@ FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \ @pytest.mark.parametrize("block_size", [64]) @pytest.mark.parametrize("causal", [True]) @pytest.mark.parametrize("varlen", [False, True]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @torch.inference_mode() def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, - varlen): - # TODO: parametrize using pytest - dtype = torch.bfloat16 + varlen, dtype): device = torch.device("cuda:0") torch.set_default_dtype(dtype) torch.set_default_device(device) @@ -48,7 +47,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, random.seed(0) print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, 
" - f"{d=}, {dv=}, {causal=}, {varlen=}") + f"{d=}, {dv=}, {causal=}, {varlen=}, {dtype=}") cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32) if varlen: From ba81acbdc1eec643ba815a76628ae3e4b2263b76 Mon Sep 17 00:00:00 2001 From: Frank Wang <41319051+frankwang28@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:43:06 -0700 Subject: [PATCH 211/932] [Bugfix] Bump DeepGEMM Version to Fix SMXX Layout Issues (#22606) Signed-off-by: frankwang28 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b96d50f0a1..a20a4bfb2b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -432,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install DeepGEMM from source ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" -ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment CUDA_MAJOR="${CUDA_VERSION%%.*}" From 45c3936e945ee1b869911f155d5519f2b60ce9d1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:26 +0100 Subject: [PATCH 212/932] [Docs] Hide the navigation and toc sidebars on home page (#22749) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/README.md b/docs/README.md index e8d2fd953a..683e1d3756 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,3 +1,9 @@ +--- +hide: + - navigation + - toc +--- + # Welcome to vLLM
From d0a63015888f5d5ab33e369bfa5ede4c8e0faea7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:30 +0100 Subject: [PATCH 213/932] Fix Transformers backend tensor parallel for multimodal models (#22673) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 49 +++++++++++++++------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 25b8b69e08..4ec2b683fc 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -505,30 +505,47 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): Apply the model's tensor parallelization plan. Currently only supports linear layers. """ - tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} + # Look for tp plans in all of the PreTrainedModels found in self.model + is_pretrained_model = lambda m: isinstance(m, PreTrainedModel) + supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None + pretrained_models = filter(is_pretrained_model, self.model.modules()) + models_with_tp_plan = filter(supports_tp_plan, pretrained_models) - if not tp_plan and self.tp_size > 1: + if not any(models_with_tp_plan) and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - # Some weight loaders expect linear layers to inherit from vLLM's - # LinearBase class, so we set a default style which causes any - # unspecified linear layers to be replaced with ReplicatedLinear - tp_plan[".*"] = "replicate" + def _tensor_parallel(module: nn.Module, + prefix: str = "", + tp_plan=None): + tp_plan = tp_plan or {} - def _tensor_parallel(module: nn.Module, prefix: str = ""): + # If the current module is a PreTrainedModel, set the tp_plan for + # all of its children + if isinstance(module, PreTrainedModel): + tp_plan = module.config.base_model_tp_plan or {} + tp_plan = { + maybe_prefix(prefix, k): v + for k, v in tp_plan.items() + } + + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in tp_plan.items(): - if re.match(pattern, qual_name) and isinstance( - child_module, nn.Linear): - new_module = replace_linear_class( - child_module, style, self.quant_config) - setattr(module, child_name, new_module) - log_replacement(qual_name, child_module, new_module) - break + if isinstance(child_module, nn.Linear): + generator = (p for p in tp_plan if re.match(p, qual_name)) + pattern = next(generator, None) + style = tp_plan.get(pattern, "replicate") + new_module = replace_linear_class(child_module, style, + self.quant_config) + setattr(module, child_name, new_module) + log_replacement(qual_name, child_module, new_module) else: - _tensor_parallel(child_module, prefix=qual_name) + _tensor_parallel(child_module, + prefix=qual_name, + tp_plan=tp_plan) _tensor_parallel(self.model) From fde0b611a37e442cb8a53999a1cce48d76f49c16 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 13 Aug 2025 08:13:17 +0800 Subject: [PATCH 214/932] [Model] Decouple glm4v (#22751) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/glm4_1v.py 
| 26 +++++++++++++++++++++----- vllm/model_executor/models/registry.py | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce..dbbbc5122b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7983895687..2a89c03bfe 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1227,10 +1227,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, "k_proj", "v_proj", ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], + "gate_up_proj": ["gate_up_proj"] } # To ensure correct weight loading and mapping. 
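For context on the hunk above: packed_modules_mapping tells the weight loader which checkpoint tensors get stacked into a single fused parameter. An identity entry such as "gate_up_proj": ["gate_up_proj"] indicates the dense GLM-4.1V checkpoint already ships the fused tensor, while the original split mapping is retained for the new GLM-4.5V MoE subclass added further down in this patch. A minimal, standalone sketch of the stacking idea (illustrative names only, not vLLM's actual loader):

    import torch

    packed_modules_mapping = {
        # Split mapping: the checkpoint stores gate_proj and up_proj separately.
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def fuse_checkpoint_weights(state_dict, mapping):
        fused = {}
        for packed_name, shard_names in mapping.items():
            if shard_names == [packed_name]:
                # Identity mapping: the tensor is already fused in the checkpoint.
                fused[packed_name] = state_dict[packed_name]
            else:
                # Stack the listed shards along the output dimension.
                fused[packed_name] = torch.cat(
                    [state_dict[name] for name in shard_names], dim=0)
        return fused

    # Example: two 8x4 shards become one 16x4 fused gate_up_proj weight.
    ckpt = {"gate_proj": torch.randn(8, 4), "up_proj": torch.randn(8, 4)}
    print(fuse_checkpoint_weights(ckpt, packed_modules_mapping)["gate_up_proj"].shape)
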
@@ -1567,7 +1564,26 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, Get the module prefix in multimodal models """ return MultiModelKeys.from_string_field( - language_model="language_model", + language_model="language_model.model", connector="visual.merger.", tower_model="visual.", ) + + +@MULTIMODAL_REGISTRY.register_processor( + Glm4vMultiModalProcessor, + info=Glm4vProcessingInfo, + dummy_inputs=Glm4vDummyInputsBuilder, +) +class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 64dbde4916..b817615b43 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -208,7 +208,7 @@ _MULTIMODAL_MODELS = { "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 - "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), From e18859298d109870b22cb5b8672d1078818e268d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Aug 2025 20:14:46 -0400 Subject: [PATCH 215/932] Add hardware plugins to installation doc (#22732) Signed-off-by: Michael Goin Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index a252343dce..f6ecceb85d 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -14,3 +14,16 @@ vLLM supports the following hardware platforms: - [Google TPU](google_tpu.md) - [Intel Gaudi](intel_gaudi.md) - [AWS Neuron](aws_neuron.md) + +## Hardware Plugins + +The backends below live **outside** the main `vllm` repository and follow the +[Hardware-Pluggable RFC](../design/plugin_system.md). 
+ +| Accelerator | PyPI / package | Repository | +|-------------|----------------|------------| +| Ascend NPU | `vllm-ascend` | | +| Intel Gaudi (HPU) | N/A, install from source | | +| MetaX MACA GPU | N/A, install from source | | +| Rebellions ATOM / REBEL NPU | `vllm-rbln` | | +| IBM Spyre AIU | `vllm-spyre` | | From 71683ca6f6764f35abe23d612a0d7dbd33babe32 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:18:39 -0700 Subject: [PATCH 216/932] [V0 Deprecation] Remove multi-step scheduling (#22138) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- .../tests/genai-perf-tests.json | 1 - .../tests/nightly-tests.json | 6 - .buildkite/test-pipeline.yaml | 22 - .github/CODEOWNERS | 1 - tests/async_engine/test_async_llm_engine.py | 409 -------- tests/config/test_config.yaml | 1 - tests/config/test_config_with_model.yaml | 1 - tests/core/test_chunked_prefill_scheduler.py | 10 +- tests/core/test_num_computed_tokens_update.py | 24 +- .../test_multi_step_output_processor.py | 274 ------ .../openai/correctness/test_lmeval.py | 3 - tests/metrics/test_metrics.py | 39 - .../models/language/generation/test_hybrid.py | 26 - .../multi_step/test_correctness_async_llm.py | 232 ----- tests/multi_step/test_correctness_llm.py | 383 -------- tests/samplers/test_logits_processor.py | 70 -- tests/tpu/lora/test_lora.py | 1 - tests/utils_/test_utils.py | 2 - tests/v1/test_oracle.py | 6 - tests/worker/test_model_input.py | 79 -- vllm/config/__init__.py | 2 - vllm/core/scheduler.py | 92 +- vllm/engine/arg_utils.py | 43 +- vllm/engine/async_llm_engine.py | 26 +- vllm/engine/llm_engine.py | 178 +--- vllm/engine/output_processor/interfaces.py | 26 +- vllm/engine/output_processor/multi_step.py | 211 ---- vllm/platforms/cuda.py | 14 +- vllm/platforms/rocm.py | 14 +- vllm/platforms/tpu.py | 7 +- vllm/sequence.py | 38 - vllm/worker/model_runner.py | 7 +- vllm/worker/multi_step_model_runner.py | 908 ------------------ vllm/worker/multi_step_neuron_model_runner.py | 84 -- ...i_step_neuronx_distributed_model_runner.py | 63 -- vllm/worker/multi_step_worker.py | 197 ---- vllm/worker/neuron_worker.py | 22 +- 37 files changed, 57 insertions(+), 3465 deletions(-) delete mode 100644 tests/async_engine/test_async_llm_engine.py delete mode 100644 tests/engine/test_multi_step_output_processor.py delete mode 100644 tests/multi_step/test_correctness_async_llm.py delete mode 100644 tests/multi_step/test_correctness_llm.py delete mode 100644 tests/samplers/test_logits_processor.py delete mode 100644 vllm/engine/output_processor/multi_step.py delete mode 100644 vllm/worker/multi_step_model_runner.py delete mode 100644 vllm/worker/multi_step_neuron_model_runner.py delete mode 100644 vllm/worker/multi_step_neuronx_distributed_model_runner.py delete mode 100644 vllm/worker/multi_step_worker.py diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index f26ae7634f..afb844880f 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -12,7 +12,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 41b4a40088..423a3bfe12 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ 
b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -36,7 +36,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -90,7 +89,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -144,7 +142,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -195,7 +192,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -248,7 +244,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -301,7 +296,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebcf51981e..740be2bc87 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -67,7 +67,6 @@ steps: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal @@ -773,27 +772,6 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. 
- # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py - - label: Pipeline Parallelism Test # 45min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a0a327319a..b0dd5e99d4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -36,7 +36,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py deleted file mode 100644 index 0eb7a6eb52..0000000000 --- a/tests/async_engine/test_async_llm_engine.py +++ /dev/null @@ -1,409 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import uuid -from asyncio import CancelledError -from copy import copy -from dataclasses import dataclass, field -from typing import Any, Optional - -import pytest -import pytest_asyncio -import torch - -from vllm import SamplingParams -from vllm.config import ParallelConfig -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine -from vllm.outputs import RequestOutput as RealRequestOutput -from vllm.sampling_params import RequestOutputKind - -from ..utils import wait_for_gpu_memory_to_clear - - -@dataclass -class RequestOutput: - request_id: int - finished: bool = False - - -@dataclass -class MockModelConfig: - use_async_output_proc = True - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - - -class MockEngine: - - def __init__(self): - self.step_calls = 0 - self.add_request_calls = 0 - self.abort_request_calls = 0 - self.request_id = None - # Ugly, remove dependency when possible - self.parallel_config = ParallelConfig() - self.model_config = MockModelConfig() - - async def step_async(self, virtual_engine): - # PP size is 1, ignore virtual engine - self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] - - async def process_model_inputs_async(self, *args, **kwargs): - pass - - async def stop_remote_worker_execution_loop_async(self): - pass - - def generate(self, request_id): - self.request_id = request_id - - def stop_generating(self): - self.request_id = None - - def add_request(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - print(f'Request calls: {self.add_request_calls}') - - async def add_request_async(self, **kwargs): - self.add_request_calls += 1 - return - - def abort_request(self, request_id): - del request_id # Unused - self.abort_request_calls += 1 - - def has_unfinished_requests(self): - return self.request_id is not None - - def has_unfinished_requests_for_virtual_engine(self, virtual_engine): - return self.request_id is not None - - -class MockAsyncLLMEngine(AsyncLLMEngine): - _engine_class = MockEngine - - -@pytest.mark.asyncio -async def test_new_requests_event(): - params = SamplingParams() - - engine = MockAsyncLLMEngine() - engine.start_background_loop() - await asyncio.sleep(0.01) - assert engine.engine.step_calls == 0 - - await engine.add_request("1", "", 
params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 1 - assert engine.engine.step_calls == 1 - - await engine.add_request("2", "", params) - engine.engine.generate("2") - await asyncio.sleep(0) - await asyncio.sleep(0) - await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls >= 2 - await asyncio.sleep(0.001) - assert engine.engine.step_calls >= 3 - engine.engine.stop_generating() - await asyncio.sleep(0.001) - old_step_calls = engine.engine.step_calls - await asyncio.sleep(0.001) - assert engine.engine.step_calls == old_step_calls - - await engine.add_request("3", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - - engine = MockAsyncLLMEngine() - assert engine.get_model_config() is not None - assert engine.get_tokenizer() is not None - assert engine.get_decoding_config() is not None - - -def start_engine(): - wait_for_gpu_memory_to_clear( - devices=list(range(torch.cuda.device_count())), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1")) - print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") - - return AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(model="facebook/opt-125m", - enforce_eager=True, - num_scheduler_steps=num_scheduler_steps)) - - -def uid() -> str: - return str(uuid.uuid4()) - - -@pytest_asyncio.fixture(scope="module") -async def async_engine(): - # We cannot use monkeypatch since this is a module - # scoped fixture and monkeypatch is function scoped. 
- previous_value = os.getenv("VLLM_USE_V1", None) - os.environ["VLLM_USE_V1"] = "0" - engine = await asyncio.get_event_loop().run_in_executor(executor=None, - func=start_engine) - try: - yield engine - finally: - engine.shutdown_background_loop() - del engine - await asyncio.sleep(0.1) - cleanup_dist_env_and_memory() - - if previous_value: - os.environ["VLLM_USE_V1"] = previous_value - else: - del os.environ["VLLM_USE_V1"] - - -@pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: - # So we can share the async engine fixture between these tests - return False - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_asyncio_run(async_engine, stop): - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - async def run(prompt: str): - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - sampling_params, - request_id=uid()): - output_count += 1 - final_output = output - return final_output, output_count - - results = await asyncio.gather( - run("test0"), - run("test0"), - ) - assert len(results) == 2 - first, second = results - - # remove nondeterministic fields for comparison - first[0].metrics = None - second[0].metrics = None - first[0].request_id = None - second[0].request_id = None - - assert str(first) == str(second) - - output_count = results[0][1] - if num_scheduler_steps == 1: - assert output_count == 32 - else: - assert 1 < output_count < 32 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_output_kinds(async_engine, stop): - """Test that output_kind works as expected and that - results are equivalent across different kinds.""" - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - async def run(prompt: str, kind: RequestOutputKind): - params = copy(sampling_params) - params.output_kind = kind - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - output_count += 1 - final_output = output - - assert final_output is not None - assert final_output.finished - - return (final_output.prompt_token_ids, - final_output.outputs[0].token_ids, - final_output.outputs[0].text, output_count) - - async def run_deltas(prompt: str): - params = copy(sampling_params) - params.output_kind = RequestOutputKind.DELTA - - prompt_tokens = None - output_tokens: list[int] = [] - output_text = "" - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - token_ids = output.outputs[0].token_ids - text = output.outputs[0].text - final_output = output - - # Ensure we get prompt ids iff we haven't yet received output tokens - if output_tokens: - assert 1 <= len(token_ids) <= num_scheduler_steps - assert stop or text - assert not output.prompt_token_ids - else: - assert output.prompt_token_ids - prompt_tokens = output.prompt_token_ids - - output_tokens.extend(token_ids) - output_text += text - - output_count += 1 - - assert final_output is not None - assert final_output.finished - - return prompt_tokens, output_tokens, output_text, 
output_count - - results = await asyncio.gather( - run("common input prompt", RequestOutputKind.CUMULATIVE), - run("common input prompt", RequestOutputKind.FINAL_ONLY), - run_deltas("common input prompt")) - - # Make sure outputs are the same - prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results) - assert len(prompt_set) == 1 - - text_set = set(text for _, _, text, _ in results) - assert len(text_set) == 1 - - tokens_set = set(tuple(ids) for _, ids, _, _ in results) - assert len(tokens_set) == 1 - - cumulative, final, deltas = results - - # output message counts - assert cumulative[3] == deltas[3] - - if num_scheduler_steps == 1: - assert cumulative[3] == 32 - else: - assert 1 < cumulative[3] < 32 - - assert final[3] == 1 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_cancellation(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - min_tokens=13, - max_tokens=13, - stop=stop, - ) - - stop_at = 5 if num_scheduler_steps == 1 else 1 - - request_id = uid() - - i = 0 - with pytest.raises(CancelledError): - async for output in async_engine.generate("test2", - sampling_params, - request_id=request_id): - assert not output.finished - i += 1 - if i == stop_at: - await async_engine.abort(request_id) - - assert i == stop_at - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_delayed_generator(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - stop=stop, - ) - - stream = async_engine.generate("test3", sampling_params, request_id=uid()) - i = 0 - final_output: Optional[RealRequestOutput] = None - async for output in stream: - final_output = output - if i == 0: - # wait for generation to complete before consuming - # the remaining messages - await asyncio.sleep(1) - if i < 9: - assert not output.finished - i += 1 - - assert i == 10 - assert final_output is not None - assert len(final_output.outputs[0].token_ids) == 10 - assert final_output.finished - - -@pytest.mark.asyncio(scope="module") -async def test_invalid_argument(async_engine): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - ) - - # Targeting specific DP rank only supported in v1 multi-instance DP - with pytest.raises(ValueError): - async for _ in async_engine.generate("test", - sampling_params, - request_id=uid(), - data_parallel_rank=0): - pass diff --git a/tests/config/test_config.yaml b/tests/config/test_config.yaml index 5090e8f357..a16857b5f2 100644 --- a/tests/config/test_config.yaml +++ b/tests/config/test_config.yaml @@ -2,4 +2,3 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml index d8c8c7bc81..9fbdb77d4e 100644 --- a/tests/config/test_config_with_model.yaml +++ b/tests/config/test_config_with_model.yaml @@ -4,4 +4,3 @@ port: 
12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index d4dacc4f12..ce1fe189b3 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -644,11 +644,9 @@ def test_chunked_prefill_preempt(): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) -def test_chunked_prefill_spec_prefill(num_scheduler_steps): +def test_chunked_prefill_spec_prefill(): """Verify that the num_lookahead_slots is set appropriately for an all""" - """prefill batch depending on whether multi-step scheduling is enabled""" - """or not""" + """prefill batch.""" block_size = 4 max_seqs = 30 max_model_len = 200 @@ -661,7 +659,6 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): max_model_len, enable_chunked_prefill=True, num_lookahead_slots=num_lookahead_slots, - num_scheduler_steps=num_scheduler_steps, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 @@ -679,8 +676,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): assert out.num_prefill_groups == 1 assert out.num_batched_tokens == max_num_batched_tokens print(out.num_lookahead_slots) - assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else - num_lookahead_slots) + assert out.num_lookahead_slots == 0 def test_chunked_prefill_max_seqs(): diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 9e1b7913df..131a7b3a62 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -6,7 +6,6 @@ import pytest from tests.conftest import VllmRunner from tests.core.utils import create_dummy_prompt from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform from vllm.sequence import SequenceGroup MODEL = "JackFram/llama-160m" @@ -17,32 +16,19 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup): scheduler.add_seq_group(seq_group) -@pytest.mark.parametrize("num_scheduler_steps", [1, 8]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True]) -def test_num_computed_tokens_update(num_scheduler_steps: int, - enable_chunked_prefill: bool, +def test_num_computed_tokens_update(enable_chunked_prefill: bool, enforce_eager: bool): - is_multi_step = num_scheduler_steps > 1 - is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill - - if is_multi_step_chunked_prefill and current_platform.is_rocm(): - pytest.skip("Multi-step with Chunked-Prefill does not support " - "rocm_flash_attn backend") - # Make a vllm engine runner = VllmRunner(model_name=MODEL, gpu_memory_utilization=0.7, - num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) engine: LLMEngine = runner.llm.llm_engine - # In multi-step + chunked-prefill there is no separate single prompt step. - # What is scheduled will run for num_scheduler_steps always. 
- num_prompt_steps = num_scheduler_steps \ - if is_multi_step_chunked_prefill else 1 + num_prompt_steps = 1 num_output_tokens_list = [4, 8, 12, 15, 16, 17] @@ -73,10 +59,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, # Test correctness of num_computed_tokens after the decode steps assert seq.data.get_num_computed_tokens( ) == prompt_num_computed_tokens + decode_step_counter - for _ in range(num_scheduler_steps): - # decode step - engine.step() - decode_step_counter += 1 + engine.step() + decode_step_counter += 1 # Test correctness of num_computed_tokens after the sequence finish. assert seq.data.get_num_computed_tokens( diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py deleted file mode 100644 index 458f4deb74..0000000000 --- a/tests/engine/test_multi_step_output_processor.py +++ /dev/null @@ -1,274 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -from transformers import PreTrainedTokenizer - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput, SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -from ..core.utils import create_seq_group - - -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [1, 12]) -@pytest.mark.skip_global_cleanup -def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify multi-step decoding appends token ids correctly. - - We append token ids and verify all the token ids were appended correctly. - Note that ignore_eos=True. 
- """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=1024, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, - ignore_eos=True), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids - output_processor.process_outputs(seq_group, outputs) - assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) -@pytest.mark.parametrize("max_tokens", [128 + 3]) -@pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, max_tokens: int): - """Verify tokens after max_tokens are dropped and not appended to the - sequence. - """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=max_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go over max tokens in len. - assert seq.get_len() == seq_prompt_len + max_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """Verify the eos token id is included in the sequence, but subsequent - tokens are dropped (not appended to sequence). 
- """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go beyond provided eos. - assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:eos_index + 1] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """When sampling parameters dictate that we should ignore the eos token id, - ensure all token ids are appended even if the eos token id is emitted. - """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, - ignore_eos=True, - ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to go beyond eos. 
- assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -def mock_tokenizer(eos_token_id=1000): - tokenizer = MagicMock(spec=PreTrainedTokenizer) - tokenizer.eos_token_id = eos_token_id - return tokenizer diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index d75731637d..684407cd6e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -26,15 +26,12 @@ DEFAULT_ARGS = ["--max-model-len", "4096"] MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked - ["--num-scheduler-steps", "8"], # MS - ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] MAX_WAIT_SECONDS = None if current_platform.is_tpu(): MORE_ARGS_LIST = [ [], # Default - # ["--num-scheduler-steps", "8"], # Multi-step << currently fails ] MAX_WAIT_SECONDS = 600 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8cae8a80d3..dbd9c518e0 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -94,45 +94,6 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [128, 129]) -@pytest.mark.parametrize("disable_async_output_proc", [True, False]) -def test_metric_counter_generation_tokens_multi_step( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - disable_async_output_proc: bool, -) -> None: - num_scheduler_steps = 8 - with vllm_runner( - model, - disable_log_stats=False, - gpu_memory_utilization=0.4, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - # The multi-step scheduling will continue to execute forward even when - # encountering EOS, leading to slightly imprecise metrics. 
- assert abs(vllm_generation_count - metric_count) <\ - len(example_prompts) * num_scheduler_steps, \ - (f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 76f6c226ba..19fcbf5616 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -331,32 +331,6 @@ def test_state_cleanup( "could be related to finished_requests_ids") -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_multistep_correctness( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, -) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) - - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: - vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_multistep, - outputs_1_lst=vllm_outputs_single_step, - name_0="vllm_outputs_multistep", - name_1="vllm_outputs_single_step", - ) - - @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py deleted file mode 100644 index 56e339d485..0000000000 --- a/tests/multi_step/test_correctness_async_llm.py +++ /dev/null @@ -1,232 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the AsyncLLMEngine with multi-step-decoding -from typing import Optional - -import pytest - -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close -from ..utils import (completions_with_server_args, get_client_text_generations, - get_client_text_logprob_generations) - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - -DEFAULT_SERVER_ARGS: list[str] = [ - "--distributed-executor-backend", - "ray", - "--gpu-memory-utilization", - "0.85", - "--swap-space", - "16", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 1), - (2, 2), -]) -@pytest.mark.parametrize("eager_mode", [False, True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("is_async", [True]) -@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.asyncio -async def test_multi_step( - example_prompts, - model: str, - tp_size: int, - pp_size: int, - eager_mode: int, - num_scheduler_steps: int, - num_prompts: int, - is_async: bool, - num_logprobs: Optional[int], - attention_backend: str, - enable_chunked_prefill: bool, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling in an OpenAI-protocol - client/server environment. - - Set up an engine with single-step scheduling as a ground-truth reference. 
- - Send a completions API request to both engines with the same prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - """ - if enable_chunked_prefill and \ - (pp_size > 1 or attention_backend != "FLASH_ATTN"): - pytest.skip("Multi-step with Chunked-Prefill only supports" - "PP=1 and FLASH_ATTN backend") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] - - if not is_async: - ms_server_args += ["--disable-async-output-proc"] - - if eager_mode: - ms_server_args.append("--enforce-eager") - - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") - - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations - - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations( - ref_completions) - test_text_logprobs = get_client_text_logprob_generations( - test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 2), -]) -@pytest.mark.asyncio -async def test_multi_step_pp_smoke( - tp_size: int, - pp_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Smoke test for the vLLM engine with multi-step scheduling in an - OpenAI-protocol client/server environment. - - This tests compares the outputs between multi-step scheduling and - single-step scheduling. Notably, this test lets the engines generate - more tokens (default is 5) and test for an exact match over all the - tokens. 
- - Args: - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - """ - - model = "JackFram/llama-160m" - num_scheduler_steps = 8 - attention_backend = "FLASH_ATTN" - max_num_seqs = 3 - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. - max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) - - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] - - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. 
- ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - - assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py deleted file mode 100644 index 0df00c98b7..0000000000 --- a/tests/multi_step/test_correctness_llm.py +++ /dev/null @@ -1,383 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the LLMEngine with multi-step-decoding - -import copy -from typing import Optional - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_multi_step_llm( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - enable_chunked_prefill: bool, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling via sync LLM Engine. - - Set up a HuggingFace (HF) transformers model as a ground-truth reference. - - Prompt them with the same example prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - enable_chunked_prefill: chunked-prefill on/off - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. 
- """ - if current_platform.is_rocm() and \ - (attention_backend == "FLASHINFER" or enable_chunked_prefill): - pytest.skip( - "Multi-Step with FLASHINFER or Chunked-Prefill is not supported" - "on ROCm") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -def test_multi_step_llm_w_prompt_logprobs( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test prompt logprobs with multi-step scheduling via sync LLM Engine. - - Set up a vLLM engine instance w/ single-step scheduling as a ground-truth - reference. - - Prompt them with the same example prompts. - - Validate: - * All generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - num_prompt_logprobs: number of logprobs to return for each prompt token; - note that this argument is not supported by the - OpenAI completions endpoint. 
- """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -@pytest.mark.skipif( - current_platform.is_rocm(), - reason="Multi-Step + Chunked-Prefill not supported on ROCm") -def test_multi_step_llm_chunked_prefill_prefix_cache( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. - - Set up contrived scenario which tests for a possible failure mode of - scheduling with multi-step+"single-step chunked prefill"+APC - - "single-step chunked prefill" here refers to the current vLLM multi-step+ - chunked-prefill implementation, which requires that a prefill may only - be scheduled in the same step as decodes if the prefill prompt fits in a - single chunk (note that "complete" multi-step+chunked-prefill would allow - a prefill to span multiple chunks & multiple steps but that is not yet - the case.) - - "APC" is short for "automatic prefix caching". - - This test creates a scenario where the scheduler must decide whether/how - to schedule a prefill with a prompt that exceeds the available token budget. - The correct behavior for multi-step+"single-step chunked prefill"+APC is to - put off scheduling the prefill until a future step. - - Validate that: - * Multi-step kernels do not raise an exception due to incorrect scheduler - behavior - * Generated tokens match between - multi-step+"single-step chunked prefill"+APC and - single-step scheduling. 
- * (If logprobs are enabled) check logprobs are close enough - - Args: - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. - """ - - # Set up contrived test for correct scheduling behavior with - # multi-step+"single-step chunked prefill"+APC. - # - # Assume block_size=16 - # - # Assume max_num_batched_tokens=48 - # => Per-step token budget=48 - # - # 1. Scheduler schedules 0th prompt (24 tokens) - # => Remaining token budget=24 - # 2. Scheduler attempts to schedule 1st prompt (30 tokens) - # * 30 tokens exceeds 24 token remaining budget - # * Correct behavior: do not schedule this prompt in this step - # * Incorrect behavior: schedule prompt chunk - # * `do_sample=False` for this prompt in this step - # * Chunk size = (remaining tokens // block size) * block size - # - # The Incorrect scheduling behavior - if it occurs - will cause an exception - # in the model runner resulting from `do_sample=False`. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ( - 'vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n') # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok - - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts - - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - 
name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py deleted file mode 100644 index 123f9595e9..0000000000 --- a/tests/samplers/test_logits_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_logits_processor_force_generate( - vllm_runner, - example_prompts, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times - - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits - - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[1], - params=SamplingParams( - prompt_logprobs=3, - max_tokens=max_tokens, - ), - ) - - # test grouped requests - vllm_model.llm._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) - - outputs = vllm_model.llm._run_engine(use_tqdm=False) - - assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 4c47b8c43c..636108e985 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -30,7 +30,6 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", - num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index a2db1ae684..8be1e103dc 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -236,7 +236,6 @@ def test_config_args(parser_with_config, cli_config_file): ['serve', 'mymodel', '--config', cli_config_file]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code - assert not args.multi_step_stream_outputs def test_config_file(parser_with_config): @@ -828,7 +827,6 @@ def test_model_specification(parser_with_config, cli_config_file, ]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code is True - assert args.multi_step_stream_outputs is False assert args.port == 12312 diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index a756c89b52..1f16e92f65 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -58,12 +58,6 @@ def 
test_unsupported_configs(monkeypatch): disable_async_output_proc=True, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - num_scheduler_steps=5, - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index ec33d334ab..2031f41fab 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -11,7 +11,6 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.model_executor import SamplingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -from vllm.worker.multi_step_model_runner import StatefulModelInput from vllm.worker.pooling_model_runner import ( ModelInputForGPUWithPoolingMetadata) @@ -166,81 +165,3 @@ def test_embedding_model_runner_input(): None) == getattr(attn_metadata, field.name, None) # Pooling metadata is not broadcast. assert received_model_input.pooling_metadata is None - - -def test_multi_step_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - frozen_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - is_last_step=True, - is_first_multi_step=False, - current_step=4, - last_sampled_token_ids=torch.ones((10, 1)), - is_multi_step=True, - num_queries=8, - num_seqs=5, - cached_outputs=[], - ) - - assert isinstance(model_input, StatefulModelInput) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - - received_frozen_input = received_model_input.frozen_model_input - - # Check that received copy has correct values. - assert isinstance(received_model_input, StatefulModelInput) - assert received_frozen_input.input_tokens is not None - assert (received_frozen_input.input_tokens == - frozen_model_input.input_tokens).all() - assert received_frozen_input.input_positions is not None - assert (received_frozen_input.input_positions == - frozen_model_input.input_positions).all() - assert received_frozen_input.multi_modal_kwargs is None - assert (frozen_model_input.multi_modal_kwargs == - frozen_model_input.multi_modal_kwargs) - assert received_frozen_input.lora_requests is None - assert (received_frozen_input.lora_requests == - frozen_model_input.lora_requests) - assert received_frozen_input.lora_mapping is None - assert ( - received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_frozen_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. 
- assert (received_frozen_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_frozen_input.sampling_metadata.seq_groups is None - - # check non frozen fields - assert received_model_input.is_last_step == model_input.is_last_step - assert (received_model_input.is_first_multi_step == - model_input.is_first_multi_step) - assert received_model_input.current_step == model_input.current_step - assert (received_model_input.last_sampled_token_ids == - model_input.last_sampled_token_ids).all() - assert received_model_input.is_multi_step == model_input.is_multi_step diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df4eb33f5d..6649cd89ee 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3779,8 +3779,6 @@ class VllmConfig: f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " - f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, " - f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, " # noqa f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 61346da145..63894e7f5d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -929,8 +929,7 @@ class Scheduler: ) def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): + if self.scheduler_config.chunked_prefill_enabled: prompt_limit = self.scheduler_config.max_model_len else: prompt_limit = min( @@ -1114,9 +1113,6 @@ class Scheduler: continue num_lookahead_slots: int = 0 - if self.scheduler_config.is_multi_step and enable_chunking: - num_lookahead_slots = self._get_num_lookahead_slots( - True, enable_chunking) # If the sequence group cannot be allocated, stop. can_allocate = self.block_manager.can_allocate( @@ -1195,24 +1191,6 @@ class Scheduler: partial_prefill_metadata.maybe_increment_partial_prefills( seq_group) - if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: List[Tuple[int, int]] = [] - # init_multi_step_from_lookahead_slots happens in append_slots - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - # This assert will trip when a copy-on-write happens. This is - # not a concern as the very first sequence-group block - # allocation happens above. Still, we have the assert to - # catch any edge-cases. - assert not blocks_to_copy - else: - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config. 
- num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -1453,14 +1431,6 @@ class Scheduler: num_prefill_groups = (len(prefills.seq_groups) + len(swapped_in.prefill_seq_groups) + len(running_scheduled.prefill_seq_groups)) - # If all prompts, then we set num_lookahead_slots to 0 - # this allows us to go through the `no_spec` path in - # `spec_decode_worker.py` - all_prefills = len(scheduled_seq_groups) == num_prefill_groups - num_lookahead_slots = (0 if - (all_prefills - and not self.scheduler_config.is_multi_step) - else running_scheduled.num_lookahead_slots) return SchedulerOutputs( scheduled_seq_groups=scheduled_seq_groups, num_prefill_groups=num_prefill_groups, @@ -1472,7 +1442,7 @@ class Scheduler: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=num_lookahead_slots, + num_lookahead_slots=0, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), @@ -1516,11 +1486,6 @@ class Scheduler: num_lookahead_slots = self._get_num_lookahead_slots( is_prefill, enable_chunking) - if is_prefill and num_lookahead_slots > 0: - # Appending prefill slots only happens multi-step and - # chunked-prefill are enabled together. - assert self.scheduler_config.is_multi_step and enable_chunking - return self.block_manager.can_append_slots( seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) @@ -1776,19 +1741,7 @@ class Scheduler: num_lookahead_slots: int = self._get_num_lookahead_slots( is_prefill, enable_chunking) - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config.num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - if self.scheduler_config.is_multi_step and enable_chunking: - # In multi-step chunked-prefill any sequence type can have - # slots appended. - seq_status = None - for seq in seq_group.get_seqs(status=seq_status): cows = self.block_manager.append_slots(seq, num_lookahead_slots) if len(cows) > 0: @@ -1904,29 +1857,8 @@ class Scheduler: """The number of slots to allocate per sequence per step, beyond known token ids. Speculative decoding uses these slots to store KV activations of tokens which may or may not be accepted. - - Speculative decoding does not yet support prefill, so we do not perform - lookahead allocation for prefill. - - When chunking is enabled with multi-step, we allocate lookahead slots - for the prefills for when the prefills turn into decodes in the first - step. """ - if is_prefill: - if self.scheduler_config.is_multi_step and enable_chunking: - # num_lookahead_slots was introduced in the context of decodes, - # in Speculative Decoding. - # When the num_scheduler_steps is 8, say, then the - # num_lookahead_slots is 7. Meaning, we are doing a 1-step of - # decode anyways and we wish to do 7 more. - # - # "lookaheads" for prefills, is introduced in support for - # Chunked-Prefill in Multi-Step. - return self.scheduler_config.num_lookahead_slots + 1 - else: - return 0 - - return self.scheduler_config.num_lookahead_slots + return 0 def _get_num_new_uncached_and_cached_tokens( self, @@ -2068,24 +2000,6 @@ class Scheduler: The number of new tokens to schedule after chunking. 
""" remaining_token_budget = budget.remaining_token_budget() - if scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > prompt_limit: - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - return num_new_tokens - - return 0 if num_new_tokens > \ - remaining_token_budget else num_new_tokens # Get the number of tokens to allocate to this prefill slot prefill_slot_budget = ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d74db67bda..c058001ceb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -362,8 +362,6 @@ class EngineArgs: lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size - num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps - multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: Optional[ int] = CacheConfig.num_gpu_blocks_override @@ -799,11 +797,8 @@ class EngineArgs: **scheduler_kwargs["delay_factor"]) scheduler_group.add_argument("--preemption-mode", **scheduler_kwargs["preemption_mode"]) - scheduler_group.add_argument("--num-scheduler-steps", - **scheduler_kwargs["num_scheduler_steps"]) - scheduler_group.add_argument( - "--multi-step-stream-outputs", - **scheduler_kwargs["multi_step_stream_outputs"]) + # multi-step scheduling has been removed; corresponding arguments + # are no longer supported. 
scheduler_group.add_argument("--scheduling-policy", **scheduler_kwargs["policy"]) scheduler_group.add_argument( @@ -1257,28 +1252,11 @@ class EngineArgs: disable_log_stats=self.disable_log_stats, ) - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if self.num_scheduler_steps > 1: - if speculative_config is not None: - raise ValueError("Speculative decoding is not supported with " - "multi-step (--num-scheduler-steps > 1)") - if self.enable_chunked_prefill and self.pipeline_parallel_size > 1: - raise ValueError("Multi-Step Chunked-Prefill is not supported " - "for pipeline-parallel-size > 1") - if current_platform.is_cpu(): - logger.warning("Multi-Step (--num-scheduler-steps > 1) is " - "currently not supported for CPUs and has been " - "disabled.") - self.num_scheduler_steps = 1 - - # make sure num_lookahead_slots is set the higher value depending on - # if we are using speculative decoding or multi-step - num_lookahead_slots = max(self.num_lookahead_slots, - self.num_scheduler_steps - 1) - num_lookahead_slots = num_lookahead_slots \ - if speculative_config is None \ - else speculative_config.num_lookahead_slots + # make sure num_lookahead_slots is set appropriately depending on + # whether speculative decoding is enabled + num_lookahead_slots = self.num_lookahead_slots + if speculative_config is not None: + num_lookahead_slots = speculative_config.num_lookahead_slots scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, @@ -1292,8 +1270,6 @@ class EngineArgs: disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, preemption_mode=self.preemption_mode, - num_scheduler_steps=self.num_scheduler_steps, - multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, @@ -1392,11 +1368,6 @@ class EngineArgs: recommend_to_remove=True) return False - if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps: - _raise_or_fallback(feature_name="--num-scheduler-steps", - recommend_to_remove=True) - return False - if self.scheduler_delay_factor != SchedulerConfig.delay_factor: _raise_or_fallback(feature_name="--scheduler-delay-factor", recommend_to_remove=True) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008e..b6ee410534 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -15,7 +15,7 @@ from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState +from vllm.engine.llm_engine import LLMEngine from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.executor.executor_base import ExecutorBase @@ -308,13 +308,6 @@ class _AsyncLLMEngine(LLMEngine): if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -351,29 +344,14 @@ 
class _AsyncLLMEngine(LLMEngine): outputs = await self.model_executor.execute_model_async( execute_model_req) - # we need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # Clear the cache if we have finished all the steps - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[ - virtual_engine] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fc4f6445d..bbe958351e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -25,7 +25,6 @@ from vllm.engine.metrics_types import StatLoggerBase, Stats from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase @@ -91,7 +90,7 @@ class OutputData(NamedTuple): class SchedulerContext: - def __init__(self, multi_step_stream_outputs: bool = False): + def __init__(self) -> None: self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] @@ -99,8 +98,6 @@ class SchedulerContext: List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None - self.multi_step_stream_outputs: bool = multi_step_stream_outputs - def append_output(self, outputs: List[SamplerOutput], seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, @@ -303,8 +300,7 @@ class LLMEngine: ] self.scheduler_contexts = [ - SchedulerContext(multi_step_stream_outputs=self.scheduler_config. - multi_step_stream_outputs) + SchedulerContext() for _ in range(self.parallel_config.pipeline_parallel_size) ] @@ -683,8 +679,7 @@ class LLMEngine: "Priority scheduling is not enabled.") if isinstance(params, SamplingParams) \ - and params.logits_processors \ - and self.scheduler_config.num_scheduler_steps > 1: + and params.logits_processors: raise ValueError( "Logits processors are not supported in multi-step decoding") @@ -868,45 +863,6 @@ class LLMEngine: return - def _update_num_computed_tokens_for_multi_step_prefill( - self, seq_group: SequenceGroup, - seq_group_meta: SequenceGroupMetadata, - is_first_step_output: Optional[bool]): - """ - This function updates num_computed_tokens for prompt sequences - when Multi-Step is enabled. - - seq_group: SequenceGroup to update the num_computed_tokens for. - seq_group_meta: Metadata of the given SequenceGroup. 
- is_first_step_output: Optional[bool] - - When available, is_first_step_output indicates if the appended - output token is the output of the first-step in multi-step. - A value of None indicates that outputs from all steps in - in multi-step are submitted in a single burst. - """ - - assert self.scheduler_config.is_multi_step - - if not seq_group_meta.is_prompt: - # num_computed_token updates for multi-step decodes happen after - # the tokens are appended to the sequence. - return - - do_update: bool = False - if self.scheduler_config.chunked_prefill_enabled: - # In multi-step + chunked-prefill case, the prompt sequences - # that are scheduled are fully processed in the first step. - do_update = is_first_step_output is None or is_first_step_output - else: - # Normal multi-step decoding case. In this case prompt-sequences - # are actually single-stepped. Always update in this case. - assert seq_group.state.num_steps == 1 - do_update = True - - if do_update: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size) - def _process_model_outputs(self, ctx: SchedulerContext, request_id: Optional[str] = None) -> None: @@ -939,33 +895,8 @@ class LLMEngine: has_multiple_outputs: bool = len(outputs) > 1 outputs_by_sequence_group: List[List[SequenceGroupOutput]] - if has_multiple_outputs: - assert self.scheduler_config.is_multi_step or \ - self.speculative_config - # Organize outputs by [step][sequence group] instead of - # [sequence group][step]. - if self.scheduler_config.is_multi_step: - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, len(seq_group_metadata_list)) - elif self.speculative_config: - # Decodes are multi-steps while prefills are not, outputting at - # most 1 token. Separate them so that we can trigger chunk - # processing without having to pad or copy over prompts K times - # to match decodes structure (costly with prompt_logprobs). - num_prefills = sum(sg.is_prompt - for sg in seq_group_metadata_list) - prefills, decodes = outputs[:num_prefills], outputs[ - num_prefills:] - outputs_by_sequence_group = create_output_by_sequence_group( - decodes, - num_seq_groups=len(seq_group_metadata_list) - num_prefills) - outputs_by_sequence_group = [p.outputs for p in prefills - ] + outputs_by_sequence_group - # We have outputs for multiple steps submitted in a single burst, - # so invalidate is_first_step_output. 
- is_first_step_output = None - else: - outputs_by_sequence_group = outputs + assert not has_multiple_outputs + outputs_by_sequence_group = outputs # Determine the requests we need to operate on if request_id: @@ -1006,13 +937,8 @@ class LLMEngine: output = [outputs_by_sequence_group[0][i]] if not is_async: - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_meta, is_first_step_output) - else: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) + seq_group.update_num_computed_tokens( + seq_group_meta.token_chunk_size or 0) if outputs: for o in outputs: @@ -1074,15 +1000,6 @@ class LLMEngine: for scheduler in self.scheduler: scheduler.free_finished_seq_groups() - # For multi-step without streaming, don't create outputs each iteration - if not is_last_step and not ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if (finished_now - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - # Create the outputs for i in indices: if i in skip or i in finished_before or i in finished_now: @@ -1101,13 +1018,7 @@ class LLMEngine: if request_output: ctx.request_outputs.append(request_output) - # For multi-step with streaming, create outputs each iteration - if not is_last_step and ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if self.process_request_outputs_callback is not None: - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return + # Create outputs only after processing the scheduler's results for seq_group in scheduler_outputs.ignored_seq_groups: params = seq_group.sampling_params @@ -1157,16 +1068,10 @@ class LLMEngine: if seq_group.is_finished(): continue - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_metadata, - seq_group.state.num_steps == 1) - else: - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) + token_chunk_size = (seq_group_metadata.token_chunk_size + if seq_group_metadata.token_chunk_size + is not None else 0) + seq_group.update_num_computed_tokens(token_chunk_size) if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( @@ -1177,16 +1082,8 @@ class LLMEngine: assert len(seq_group.seqs) == 1 seq = seq_group.seqs[0] - if self.scheduler_config.is_multi_step: - is_prefill_append = seq.data.get_num_uncomputed_tokens( - ) == 0 - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if not is_prefill_append: - seq_group.update_num_computed_tokens(1) - else: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. 
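The hunks above collapse the multi-step output path into the plain single-step one: each engine iteration now produces exactly one output burst, computed-token accounting is driven directly by the scheduled token_chunk_size, and every sampled token is appended immediately instead of being cached across lookahead steps. Below is a minimal, self-contained sketch of that simplified accounting, using toy stand-in classes (the Toy* names and process_single_step_output are hypothetical; only update_num_computed_tokens and append_token_id mirror calls visible in the diff), not the real vLLM SequenceGroup/Sequence types:

# Illustrative sketch only: toy stand-ins for the engine bookkeeping that
# remains once the multi-step branches are removed. Real vLLM types carry
# much more state (logprobs, embeds, status, schedulers, etc.).
from dataclasses import dataclass, field
from typing import List


@dataclass
class ToySequence:
    token_ids: List[int] = field(default_factory=list)

    def append_token_id(self, token_id: int) -> None:
        # Tokens are appended as soon as they are sampled; there is no
        # deferred per-step append as in the removed multi-step runner.
        self.token_ids.append(token_id)


@dataclass
class ToySequenceGroup:
    seq: ToySequence
    num_computed_tokens: int = 0

    def update_num_computed_tokens(self, n: int) -> None:
        self.num_computed_tokens += n


def process_single_step_output(seq_group: ToySequenceGroup,
                               token_chunk_size: int,
                               sampled_token_id: int) -> None:
    # With multi-step gone there is exactly one output per iteration, so the
    # chunk that was just executed is credited once and the sampled token is
    # appended right away (no cached scheduler outputs, no streaming branch).
    seq_group.update_num_computed_tokens(token_chunk_size or 0)
    seq_group.seq.append_token_id(sampled_token_id)


# Usage: one scheduled chunk of 16 tokens followed by one sampled token.
group = ToySequenceGroup(seq=ToySequence())
process_single_step_output(group, token_chunk_size=16, sampled_token_id=42)
assert group.num_computed_tokens == 16 and group.seq.token_ids == [42]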
@@ -1289,13 +1186,6 @@ class LLMEngine: if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -1345,10 +1235,6 @@ class LLMEngine: # Raise so the caller is notified that this request failed raise - # We need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: # Nothing scheduled => If there is pending async postprocessor, # then finish it here. @@ -1357,19 +1243,9 @@ class LLMEngine: # No outputs in this case outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # clear the cache if we have finished all the steps. - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[0] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 @@ -1453,22 +1329,7 @@ class LLMEngine: def _has_remaining_steps( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] ) -> bool: - if (not self.scheduler_config.is_multi_step - or not seq_group_metadata_list): - return False - - # TODO(will) this is a sanity check for nowto make sure that all the - # seqs are on the same steps. Eventually we will want to do some sort of - # dynamic scheduling when doing multi-step decoding. - ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps - if any([ - seq_group.state.remaining_steps != ref_remaining_steps - for seq_group in seq_group_metadata_list[1:] - ]): - raise AssertionError("All running sequence groups should " - "have the same remaining steps.") - - return ref_remaining_steps > 0 + return False def _cache_scheduler_outputs_for_multi_step( self, virtual_engine: int, @@ -1497,13 +1358,6 @@ class LLMEngine: def _get_last_sampled_token_ids( self, virtual_engine: int) -> Optional[torch.Tensor]: - cached_last_output = self.cached_scheduler_outputs[ - virtual_engine].last_output - if (self.scheduler_config.is_multi_step - and self.parallel_config.pipeline_parallel_size > 1 - and cached_last_output is not None - and cached_last_output.sampled_token_ids_cpu is not None): - return cached_last_output.sampled_token_ids_cpu return None def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 19c5963d32..4d75719c17 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -36,27 +36,13 @@ class SequenceGroupOutputProcessor(ABC): ): """Create an output processor. - This returns a single-step output processor if num_lookahead_slots is - zero, else returns a multi-step output processor. 
+ Multi-step scheduling is no longer supported. Always return a + single-step output processor. """ - if scheduler_config.num_lookahead_slots == 0: - # Importing here to avoid cycle. - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, - stop_checker) - else: - # Importing here to avoid cycle. - from vllm.engine.output_processor.multi_step import ( - MultiStepOutputProcessor) - return MultiStepOutputProcessor( - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, - ) + from vllm.engine.output_processor.single_step import ( + SingleStepOutputProcessor) + return SingleStepOutputProcessor(scheduler_config, detokenizer, + scheduler, seq_counter, stop_checker) @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py deleted file mode 100644 index 8b66ef0dc7..0000000000 --- a/vllm/engine/output_processor/multi_step.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import functools -from typing import Callable, List, cast - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.single_step import ( - single_step_process_prompt_logprob) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -class MultiStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. It specializes to "multi-step - decoding", where vLLM's worker may generate multiple tokens per invocation. - This is currently mutually exclusive with advanced sampling techniques like - beam search, which motivates the separation of this logic from the single - step output processor. - - This class is responsible for things such as correctly appending all new - token ids to their sequence, detokenizing new token ids, truncating new - output tokens after an eos token, and correctly handling the case where the - number of new output tokens per sequence differs in a single batch. - """ - - def __init__( - self, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], - stop_checker: StopChecker, - ): - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq - self.stop_checker = stop_checker - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with each step of a multi-step- - scheduled computation. 
- - Args: - seq_group: the outputs are associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s - for all scheduler steps - """ - for output in outputs: - # Concatenate single-step prompt logprob processing results. - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - @staticmethod - @functools.lru_cache - def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - logger.warning( - "Prompt logprob is not supported by multi step workers. " - "(e.g., speculative decode uses multi step workers).") - - def process_outputs(self, - sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool = False) -> None: - """Append new tokens in the outputs to sequences in the sequence group. - - This only supports sequence groups of size 1. It supports greater than - one new token per sequence. - - This applies logic like stop condition checking and detokenization. - It also handles cases where there are tokens emitted after - the EOS token. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - # Sequences can be in RUNNING or FINISHED_ABORTED state - # once scheduled, as a sequence is moved to FINISHED_ABORTED - # if a client disconnects from the api server. - seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - if seqs is None: - seqs = sequence_group.get_seqs( - status=SequenceStatus.FINISHED_ABORTED) - - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" - assert len(seqs) == 1, ( - "Beam search not supported in multi-step decoding.") - seq = seqs[0] - seq_id = seq.seq_id - # This method is defined in the more generic - # SequenceGroupOutputProcessor, but here we assume that the outputs are - # of a more specific type. - assert all([ - isinstance(output, CompletionSequenceGroupOutput) - for output in outputs - ]) - compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) - assert all([ - seq_id == output.samples[0].parent_seq_id - for output in compl_outputs - ]) - - if is_async: - # Async case: We process tokens one by one. Here, we know the token - # was already appended, so we only need to do the rest of the - # postprocessor: Detokenization + stopping logic - self._process_decode_and_stop(seq, sequence_group.sampling_params) - else: - # Standard multi-step case - - # Since there's only one sequence per sequence group, - # we can take the first sample. - samples = [output.samples[0] for output in compl_outputs] - - # entries in sample tokens may be invalid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples - if sample.output_token != VLLM_INVALID_TOKEN_ID - ] - - # When both spec-decode and pre-fill chunking are enabled, we - # don't have guaranteed samples here (e.g. all -1s). 
- if valid_samples: - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) - - def _process_decode_and_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - new_char_count = 0 - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - - # TODO(sang): Support lora. - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - def _process_seq_outputs(self, seq: Sequence, - valid_samples: List[SequenceOutput], - sampling_params: SamplingParams) -> None: - output_token_ids = [sample.output_token for sample in valid_samples] - output_logprobs = [sample.logprobs for sample in valid_samples] - output_embeds = [sample.output_embed for sample in valid_samples] - - # Truncate to max_tokens if necessary. - remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + - len(output_token_ids)) - if remaining_tokens < 0: - output_token_ids = output_token_ids[:remaining_tokens] - - # Truncate any tokens after EOS. This is required as spec decode - # generates a fixed number of tokens without evaluating stopping - # conditions within the block. This can cause an eos token to be - # unintentionally ignored. - if not sampling_params.ignore_eos and self.detokenizer: - eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id - # Avoiding .index calls as exception throwing in the happy path - # is expensive. - for i in range(len(output_token_ids)): - if output_token_ids[i] == eos_token_id: - output_token_ids = output_token_ids[:i + 1] - break - - is_prefill_sampled_token = seq.data.get_num_uncomputed_tokens() == 0 - # Incrementally append tokens to the sequence, as if we had only one new - # token. - for output_token_id, output_logprob, output_embed in zip( - output_token_ids, output_logprobs, output_embeds): - seq.append_token_id( - token_id=output_token_id, - logprobs=output_logprob, - token_embed=output_embed, - ) - - if is_prefill_sampled_token: - is_prefill_sampled_token = False - else: - # Update num_computed_tokens iff the sampled token is not from - # a prefill step. - seq.data.update_num_computed_tokens(1) - - self._process_decode_and_stop(seq, sampling_params) - - if seq.is_finished(): - break diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c876c52a2e..7095913157 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -118,20 +118,10 @@ class CudaPlatformBase(Platform): @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config model_config = vllm_config.model_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. 
Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -139,7 +129,7 @@ class CudaPlatformBase(Platform): else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 8005830f55..2d5bee5fc5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -327,18 +327,8 @@ class RocmPlatform(Platform): cache_config.block_size = 16 parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -346,7 +336,7 @@ class RocmPlatform(Platform): else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c56096d936..c7522a89c2 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -133,18 +133,13 @@ class TpuPlatform(Platform): parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker" assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") if scheduler_config.is_multimodal_model and not \ - scheduler_config.disable_chunked_mm_input: + scheduler_config.disable_chunked_mm_input: logger.warning("TPU does not support running Multimodal models"\ " without setting `--disable_chunked_mm_input`. " \ "Forcing --disable_chunked_mm_input.") diff --git a/vllm/sequence.py b/vllm/sequence.py index 6e65a2bd03..cbe63f8d1d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -794,35 +794,6 @@ class SequenceGroup: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def init_multi_step(self, num_steps: int) -> None: - self.state.num_steps = num_steps - self.state.current_step = 0 - - def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, - num_scheduler_steps: int, - is_multi_step: bool, - enable_chunking: bool) -> None: - - if not is_multi_step: - self.init_multi_step(num_steps=num_scheduler_steps) - return - - # Multi-Step case - is_prefill = self.is_prefill() - - # The asserts below reflect the expectations of the current system. 
- if is_prefill and enable_chunking: - assert num_lookahead_slots == num_scheduler_steps - self.init_multi_step(num_steps=num_lookahead_slots) - else: - is_decode: bool = not is_prefill - # If it is a prefill, num_lookahead_slots must be 0 - assert num_lookahead_slots == 0 or is_decode - # If it is a decode, num_lookahead_slots + 1 must match - # the scheduler steps. - assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill - self.init_multi_step(num_steps=num_lookahead_slots + 1) - def set_last_token_time(self, now: float) -> None: """Sets the last token time for Request level timings.""" # If still in prefill phase, assertion fails. @@ -1367,15 +1338,6 @@ class ExecuteModelRequest( # Async callback async_callback: Optional[Callable] = None - @property - def is_first_multi_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.current_step == 0 - @property def is_last_step(self) -> bool: # TODO(will) make this be able to handle batches with variable number of diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 20b9b733cd..a63797e3a4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -508,8 +508,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): if inter_data.is_prompt: context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder: + elif self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -778,9 +777,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): int: Returns the determined number of padding sequences. If CUDA graphs is not viable, returns -1. """ - is_mscp: bool = self.runner.scheduler_config.is_multi_step and \ - self.runner.scheduler_config.chunked_prefill_enabled - decode_only = self.decode_only or is_mscp + decode_only = self.decode_only if not decode_only: # Early exit so we can treat num_seqs as the batch_size below. 
return -1 diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py deleted file mode 100644 index 2aa910bdff..0000000000 --- a/vllm/worker/multi_step_model_runner.py +++ /dev/null @@ -1,908 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import functools -from dataclasses import dataclass, field -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch - -from vllm.distributed import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - BroadcastableModelInput, _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from ..model_executor.model_loader.tensorizer import TensorizerConfig - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -MULTI_STEP_ATTENTION_BACKENDS = [ - "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" -] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] - -def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> List[str]: - if chunked_prefill_enabled: - return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS - else: - return MULTI_STEP_ATTENTION_BACKENDS - - -def seq_output_builder(): - return SequenceOutput( - 0, 0, - {0: Logprob(logprob=float('inf'), rank=None, decoded_token=None)}) - - -def completion_seq_group_output_builder(): - return CompletionSequenceGroupOutput([], None) - - -# Used by pythonization to reduce python object allocations -class PythonizationCache: - - def __init__(self): - self.cached_seq_output = PyObjectCache(seq_output_builder) - self.cached_completion_seq_group_output = PyObjectCache( - completion_seq_group_output_builder) - - def reset(self): - self.cached_seq_output.reset() - self.cached_completion_seq_group_output.reset() - - -@dataclass -class ModelOutput: - """The output of a single model forward pass. - - The sampler_output_ready_event is set when the tensors in - sampler_output are ready (the model+sampler forward pass has - completed). We use the event to synchronize the GPU->CPU transfer, - which we want to only run when the data has been written to the - GPU tensors. Until the event is ready, the tensors in sampler_output - will have garbage data. - - There are two scenarios: - 1. The output tensors are ready and we can pythonize them immediately. - 2. The output tensors are not ready and we need to wait for the event to be - ready. - """ - sampler_output: SamplerOutput - sampler_output_ready_event: torch.cuda.Event - sampled_token_ids: Optional[torch.Tensor] = None - pythonized: bool = False - # On-device tensor containing the logprobs of each token. 
- logprobs: Optional["torch.Tensor"] = None - pythonization_cache: Optional[PythonizationCache] = None - - def pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output. Blocking.""" - if not self.pythonized: - self._pythonize_sampler_output(input_metadata, copy_stream, - pinned_sampled_token_buffer, True) - self.pythonized = True - - def maybe_pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output if ready, else return None. Non-blocking.""" - if not self.pythonized: - self.pythonized = self._pythonize_sampler_output( - input_metadata, copy_stream, pinned_sampled_token_buffer, - False) - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.cuda.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -@dataclass(frozen=False) -class StatefulModelInput(BroadcastableModelInput): - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - - # list of model outputs for each step, may not be all pythonized - cached_outputs: List[ModelOutput] = field(default_factory=list) - - # used to pass sampled token ids from the last step to the current step for - # TP workers. 
Used to append to end of outputs and used by advance_step - last_sampled_token_ids: Optional[torch.Tensor] = None - current_step: int = 0 - is_multi_step: bool = True - is_last_step: bool = False - is_first_multi_step: bool = False - base_output_proc_callback: Optional[Callable] = None - # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[current_platform.Event] = field( - default_factory=lambda: [current_platform.Event(blocking=True)] * 2) - num_seqs: int = -1 - num_queries: int = -1 - num_single_step_prefills: int = 0 - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - assert self.frozen_model_input is not None - tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() - new_tensor_dict = { - 'last_sampled_token_ids': self.last_sampled_token_ids, - 'current_step': self.current_step, - 'is_multi_step': self.is_multi_step, - 'is_last_step': self.is_last_step, - 'is_first_multi_step': self.is_first_multi_step, - 'num_seqs': self.num_seqs, - 'num_queries': self.num_queries, - 'num_single_step_prefills': self.num_single_step_prefills, - } - tensor_dict.update(new_tensor_dict) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInput": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForGPUWithSamplingMetadata, tensor_dict) - - return cls(**tensor_dict) - - def record_step_event(self, current_stream: torch.cuda.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and GPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.cuda.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - def wait_previous_step(self): - # These cuda events are an explicit synchronization to ensure that - # advance_step() (for other attn backends that may be supported in the - # future) do not clobber any data structures that is also used by any - # enqueued forwards steps. For distributed case, only a single event is - # needed, but for single GPU case, since we can let the CPU run much - # further ahead, two events allow us to overlap the advance_step with - # the previous forward (ie using two DecodeWrappers for flashinfer - # backend) - self.step_cuda_events[(self.current_step + 1) & 1].wait() - - def add_sampler_output(self, - sampler_output: SamplerOutput, - sampled_token_ids: Optional[torch.Tensor] = None): - self.cached_outputs.append( - ModelOutput(sampler_output=sampler_output, - sampler_output_ready_event=None, - sampled_token_ids=sampled_token_ids, - pythonized=False)) - - def maybe_advance_sampling_metadata(self, device: str, pin_memory: bool): - """ - sampling_metadata.selected_token_indices is constructed for the - first-step in Multi-Step. However, when chunked-prefill is enabled with - multi-step, the scheduled prompts are fully processed in the - first-step and are processed as decodes in the rest of the steps. - This function updates the sampling_metadata.selected_token_indices - to account for this conversion. 
- - Example: - Let 2 prompts and 2 decodes be scheduled together. Let the - num-tokens to process for the 2 prompts be 5 and 8 respectively. - - In that case, sampling_metadata.sampled_token_indices will be, - [4, 12, 13, 14] as it is constructed for the first-step in - multi-step. - However, the prompts turns to decodes after the first-step - and the num-tokens for the previously-prompt sequences will - be 1 and 1 as they are decodes now. The self.sampled_token_indices - must be updated to [0,1,2,3]. - """ - assert self.current_step == 1 and self.num_single_step_prefills > 0 - if not get_pp_group().is_last_rank: - return - - assert self.frozen_model_input is not None - assert self.frozen_model_input.sampling_metadata is not None - self.frozen_model_input.sampling_metadata.selected_token_indices = \ - async_tensor_h2d(list(range(self.num_queries)), - dtype=torch.long, - target_device=device, - pin_memory=pin_memory) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positions. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - assert fmi.lora_mapping is None - assert fmi.lora_requests is not None - assert len(fmi.lora_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -# MutableModelInputForGPUWithMultiStepMetadata is not subclass of -# ModelInputForGPU but it wraps the actual input dataclass and adds multi-step -# metadata -# mypy: disable-error-code=type-var -class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): - - super().__init__(*args, **kwargs) - - # Check attention backend support. - supported_attention_backends: List[str] = \ - _get_supported_attention_backends( - self.scheduler_config.chunked_prefill_enabled) - if self.attn_backend.get_name() not in supported_attention_backends: - ms_config_str: str = "Multi-Step + Chunked-Prefill" \ - if self.scheduler_config.chunked_prefill_enabled \ - else "Multi-Step" - raise ValueError( - f"{ms_config_str} not supported for attention backend: " - f"{self.attn_backend.get_name()}. 
Set VLLM_ATTENTION_BACKEND " - f"to a value from {supported_attention_backends}.") - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: GPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from GPU to CPU asynchronously - return torch.cuda.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: - model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInput: - frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInput, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with GPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for GPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, None, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
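(A minimal, self-contained sketch of the callback/pythonize chaining that _async_process_outputs and _final_process_outputs above rely on; the names below are hypothetical stand-ins for illustration, not vLLM APIs.)

    from dataclasses import dataclass

    @dataclass
    class _Step:
        ready: bool                  # stands in for "this step's CUDA event has fired"
        pythonized: bool = False

    def drain(steps, callback):
        # Alternate engine callbacks with pythonization so CPU post-processing of
        # finished steps overlaps with GPU execution of later steps.
        callback()
        for step in steps:
            if not step.ready:
                break                # CPU caught up with the GPU; resume on the next drain
            step.pythonized = True   # convert GPU-side results into Python objects
            callback()               # hand this step to the engine right away

    processed = []
    drain([_Step(True), _Step(True), _Step(False)],
          lambda: processed.append(len(processed)))
    # processed == [0, 1, 2]: the initial flush plus one hand-off per finished step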
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of GPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any GPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - None, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.cuda.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - ModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These GPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
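(The "pythonize if ready" check above amounts to an event-gated copy from the GPU into the pinned buffer allocated earlier; a minimal sketch with toy shapes, which assumes a CUDA device and is not the runner's real buffers.)

    import torch

    sampled = torch.randint(0, 50_000, (8, 1), device="cuda")   # GPU-side sampled token ids
    ready = torch.cuda.Event()
    ready.record()                     # recorded right after the sampling kernel is enqueued

    pinned = torch.empty((8, 1), dtype=sampled.dtype, device="cpu", pin_memory=True)
    if ready.query():                  # CPU is ahead of the GPU, so results are already there
        pinned.copy_(sampled, non_blocking=False)
        token_ids = pinned.tolist()    # cheap: the data already lives in pinned host memory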
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInput, - out: SamplerOutput) -> StatefulModelInput: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize GPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the GPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInput, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) - for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - GPU-side token buffer.) - sampled_token_ids: GPU-side token buffer - logprobs_tensor: GPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. 
- # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. - - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU GPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU GPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - 
seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py deleted file mode 100644 index 25f588077c..0000000000 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from importlib.util import find_spec -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - - -class MultiStepNeuronModelRunner(NeuronModelRunner): - """A model runner for multi step decoding using the transformers_neuronx - framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.speculation_config = self.speculative_config - from transformers_neuronx.config import GenerationConfig - self.speculation_config.draft_model_config.neuron_sampling_params = ( - GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K - )) - - def load_model(self) -> None: - if find_spec("transformers_neuronx") is not None: - from vllm.model_executor.model_loader.neuron import ( - get_neuron_eagle_speculation_model, - get_neuron_speculation_model) - if self.speculation_config.speculative_token_tree is not None: - self.model = get_neuron_eagle_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - raise NotImplementedError( - "Supports only Transformer-NeuronX based models.") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, 
- sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py deleted file mode 100644 index dd521dd67d..0000000000 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - - -class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): - """A model runner for multi-step decoding using the - neuronx-distributed-inference framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - - def load_model(self) -> None: - from vllm.model_executor.model_loader.neuronx_distributed import ( - get_neuron_speculation_model) - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculative_config) - - @torch.inference_mode() - def execute_model( - self, - model_input, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - sampling_params = torch.tensor([[ - seq_group.sampling_params.top_k, - seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature, - ] for seq_group in model_input.sampling_metadata.seq_groups]) - - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py deleted file mode 100644 index ea16e14f9e..0000000000 --- a/vllm/worker/multi_step_worker.py +++ /dev/null @@ -1,197 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import (MultiStepModelRunner, - StatefulModelInput) -from vllm.worker.worker import Worker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(Worker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelRunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - 
kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. - frozen_model_input.attn_metadata._cached_prefill_metadata = None - frozen_model_input.attn_metadata._cached_decode_metadata = None - - model_input.is_first_multi_step = is_first_multi_step - model_input.is_last_step = execute_model_req.is_last_step - - if not is_first_multi_step: - # we broadcast the last sampled token ids to all TP workers so they - # can update their model input metadata in-place. - self._prepare_last_sampled_token_ids_for_tp_workers( - execute_model_req=execute_model_req, model_input=model_input) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def _prepare_last_sampled_token_ids_for_tp_workers( - self, - execute_model_req: ExecuteModelRequest, - model_input: StatefulModelInput, - ) -> None: - """ - Prepare the last sampled token ids for TP workers. If it's the last - PP rank, then the last sampled token ids are already in the model_input. - If it is NOT the last PP rank, then we need to get the last sampled - token that is cached in the execute_model_req. - """ - if get_pp_group().is_last_rank: - assert model_input.cached_outputs[ - -1].sampler_output.sampled_token_ids is None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - model_input.last_sampled_token_ids = model_input.cached_outputs[ - -1].sampled_token_ids - # free sampled token ids from the previous step if it has been - # pythonized. Cannot free the last sampled token ids because - # we need it for GPU advance_step. 
- for output in model_input.cached_outputs[:-1]: - if output.pythonized: - output.sampled_token_ids = None - else: - # otherwise we need to get the cached sampled token ids from the - # execute_model_req - assert execute_model_req.last_sampled_token_ids is not None - model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - # free sampled token ids from the previous step. - # TODO(will) we could reuse the sampled token ids tensor from - # the previous step instead. - for output in model_input.cached_outputs[:-1]: - output.sampled_token_ids = None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, - torch.Tensor]]]: - """ - Depending on the current state of the request and multi step worker, - this method may skip the normal _prepare_model_input and - _prepare_worker_input methods and instead used cached values. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. 
Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 4e1408300f..3e4512a639 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -64,25 +64,21 @@ class NeuronWorker(LocalOrDistributedWorkerBase): assert (self.lora_config is None), ("LoRA is not supported for TransformersNeuronX " "framework.") - from vllm.worker.multi_step_neuron_model_runner import ( - MultiStepNeuronModelRunner) if self.speculative_config is not None: - return MultiStepNeuronModelRunner(vllm_config=vllm_config) - else: - return NeuronModelRunner(vllm_config=vllm_config) + raise NotImplementedError( + "Speculative decoding is not supported for TransformersNeuronX" + ) + return NeuronModelRunner(vllm_config=vllm_config) def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.multi_step_neuronx_distributed_model_runner import ( - MultiStepNeuronxDistributedModelRunner) from vllm.worker.neuronx_distributed_model_runner import ( NeuronxDistributedModelRunner) if self.speculative_config is not None: - assert (self.lora_config - is None), "LoRA is not supported for Speculative Decoding" - return MultiStepNeuronxDistributedModelRunner( - vllm_config=vllm_config) - else: - return NeuronxDistributedModelRunner(vllm_config=vllm_config) + assert (self.lora_config is None), ( + "LoRA is not supported for Speculative Decoding") + raise NotImplementedError( + "Speculative decoding is not supported for NeuronxDistributed") + return NeuronxDistributedModelRunner(vllm_config=vllm_config) def init_device(self) -> None: self.init_distributed_environment() From d31f97cf57839b71cc182c6547a87278aa32d8cb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:21:18 -0700 Subject: [PATCH 217/932] [Misc] Remove tests/multi_step/__init__.py (#22778) Signed-off-by: Woosuk Kwon --- tests/multi_step/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/multi_step/__init__.py diff --git a/tests/multi_step/__init__.py b/tests/multi_step/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From c5830381afbef44023ec1c97ae61ff02f22b1f9a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:38:18 -0700 Subject: [PATCH 218/932] [V0 Deprecation] Remove args for multi-step scheduling (#22779) Signed-off-by: Woosuk Kwon --- tests/utils_/test_utils.py | 1 - vllm/config/scheduler.py | 27 +-------------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 8be1e103dc..084d82dee1 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -161,7 +161,6 @@ def parser_with_config(): parser.add_argument('--port', type=int) parser.add_argument('--tensor-parallel-size', type=int) parser.add_argument('--trust-remote-code', action='store_true') - parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean) return parser diff --git 
a/vllm/config/scheduler.py b/vllm/config/scheduler.py index db669600a0..9300201279 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -115,12 +115,6 @@ class SchedulerConfig: (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead.""" - num_scheduler_steps: int = 1 - """Maximum number of forward steps per scheduler call.""" - - multi_step_stream_outputs: bool = True - """If False, then multi-step will stream outputs at the end of all steps""" - send_delta_data: bool = False """Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only @@ -193,16 +187,7 @@ class SchedulerConfig: if self.max_num_batched_tokens is None: if self.enable_chunked_prefill: - if self.num_scheduler_steps > 1: - # Multi-step Chunked-Prefill doesn't allow prompt-chunking - # for now. Have max_num_batched_tokens set to max_model_len - # so we don't reject sequences on account of a short - # max_num_batched_tokens. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - self.max_num_batched_tokens = ( - DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS else: # If max_model_len is too short, use # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value @@ -293,12 +278,6 @@ class SchedulerConfig: f"({self.num_lookahead_slots}) must be greater than or " "equal to 0.") - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - if self.max_num_partial_prefills < 1: raise ValueError( f"max_num_partial_prefills ({self.max_num_partial_prefills}) " @@ -323,7 +302,3 @@ class SchedulerConfig: f"max_num_partial_prefills ({self.max_num_partial_prefills}).") return self - - @property - def is_multi_step(self) -> bool: - return self.num_scheduler_steps > 1 From 4f0f844b1675419fd2171bc5e981a82386ec552b Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:21:50 +0800 Subject: [PATCH 219/932] Fix cuda illegal mem access with Llama4 TP8 + rms_norm custom op (#22701) Signed-off-by: Po-Han Huang --- vllm/model_executor/models/llama4.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 1f8b9d0744..308cb3e85e 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -224,10 +224,14 @@ class Llama4Attention(nn.Module): if self.rotary_emb is not None: q, k = self.rotary_emb(positions, q, k) + if self.qk_norm is not None: - q = q.reshape(-1, self.num_heads, self.head_dim) + # Normalization is applied on the head_dim dimension. The rest of + # the dimensions are collapsed into a single dimension to support + # custom rms_norm cuda kernel. 
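(In shape terms, the fix folds the (tokens, heads) dimensions into one axis so the kernel sees a plain 2-D tensor normalized over head_dim. A small sketch with toy sizes, using an unweighted plain-PyTorch RMSNorm in place of the fused custom op; the numbers are hypothetical, not Llama4's real dimensions.)

    import torch

    T, H, D = 4, 8, 128                  # tokens, heads on this rank, head_dim (toy sizes)
    q = torch.randn(T, H * D)
    q2 = q.reshape(-1, D)                # (T*H, D): the 2-D layout the rms_norm kernel expects
    rms = torch.rsqrt(q2.float().pow(2).mean(dim=-1, keepdim=True) + 1e-6)
    q_out = (q2.float() * rms).reshape(-1, H * D).to(q.dtype)    # back to (T, H*D)
    assert q_out.shape == q.shape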
+ q = q.reshape(-1, self.head_dim) q = self.qk_norm(q.float()).reshape(-1, self.q_size).to(q.dtype) - k = k.reshape(-1, self.num_kv_heads, self.head_dim) + k = k.reshape(-1, self.head_dim) k = self.qk_norm(k.float()).reshape(-1, self.kv_size).to(k.dtype) # We are applying temperature tuning (https://arxiv.org/abs/2501.19399) From b1361c7273f60ca244e5425bdb7a9120057327fe Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:05 -0400 Subject: [PATCH 220/932] [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738) Signed-off-by: mgoin --- vllm/platforms/cuda.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7095913157..63f6b373c3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,6 +152,9 @@ class CudaPlatformBase(Platform): if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True + # TODO: This does not work, because the + # global_force_attn_backend_context_manager is not set. + # See vllm/attention/selector.py:_cached_get_attn_backend envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" else: # Not Blackwell @@ -217,7 +220,9 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA: + if selected_backend == _Backend.CUTLASS_MLA or ( + cls.is_device_capability(100) and selected_backend is None + and block_size == 128): if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." From c6b928798e96f0a99a666945686c63b61bbbced4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:16 -0400 Subject: [PATCH 221/932] Force TRTLLM attention for gpt-oss on SM100 (#22678) Signed-off-by: mgoin --- vllm/model_executor/models/gpt_oss.py | 5 +---- vllm/utils/flashinfer.py | 8 ++++++++ vllm/v1/attention/backends/flashinfer.py | 11 +++++++---- vllm/v1/attention/backends/utils.py | 5 ++++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 6a65bbbe2e..7c7712dbe1 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -8,7 +8,6 @@ import torch.distributed as dist from torch import nn from transformers import GptOssConfig -from vllm import envs from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -70,11 +69,9 @@ class OAIAttention(nn.Module): tp_size = get_tensor_model_parallel_world_size() - attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION - else torch.bfloat16) self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, - dtype=attention_sink_dtype, + dtype=torch.bfloat16, requires_grad=False)) self.norm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5998d4c312..6b23ed4268 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -154,6 +154,7 @@ def use_trtllm_attention( num_qo_heads: Optional[int], num_kv_heads: Optional[int], attn_head_size: Optional[int], + has_sinks: bool = False, ) -> bool: # Requires SM100 and NVIDIA artifactory to be accessible to download cubins if not (current_platform.is_device_capability(100) @@ -165,6 +166,13 @@ def use_trtllm_attention( or num_qo_heads % 
num_kv_heads != 0): return False + # If sinks are being used, we must use TRTLLM attention as it's + # the only backend that supports them + if has_sinks: + logger.info_once( + "Using TRTLLM attention (required for attention sinks).") + return True + env_value = envs.VLLM_USE_TRTLLM_ATTENTION if env_value is not None: logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index c85d8bce31..12e5542d69 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -523,14 +523,17 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): num_kv_heads = self.kv_cache_spec.num_kv_heads head_dim = self.kv_cache_spec.head_size + # Check if any layer uses sinks (requires TRTLLM attention) + has_sinks = self.global_hyperparameters.has_sinks + # currently prefill trtllm attention does not support fp8 kv cache prefill_use_trtllm = not cache_dtype.startswith("fp8") \ and use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) decode_use_trtllm = use_trtllm_attention( num_decode_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -642,9 +645,9 @@ class FlashInferImpl(AttentionImpl): f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." ) + # Cast sinks to float32 if needed (FlashInfer requirement) if sinks.dtype != torch.float32: - raise ValueError("Sinks must be of type float32, but got " - f"{sinks.dtype}.") + sinks = sinks.to(torch.float32) self.sinks = sinks def forward( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index e23dd8bc5b..91eb84245a 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -285,6 +285,7 @@ class PerLayerParameters: window_left: int logits_soft_cap: Optional[float] sm_scale: float + has_sinks: bool = False def get_per_layer_parameters( @@ -307,9 +308,11 @@ def get_per_layer_parameters( window_left = window_size[0] if window_size is not None else -1 logits_soft_cap = getattr(impl, "logits_soft_cap", None) sm_scale = impl.scale + has_sinks = getattr(impl, "sinks", None) is not None per_layer_params[key] = PerLayerParameters(window_left, - logits_soft_cap, sm_scale) + logits_soft_cap, sm_scale, + has_sinks) return per_layer_params From 4082338a25851e1f923ad5601616f2717536c6fd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:26:38 -0400 Subject: [PATCH 222/932] Remove unneeded ROCm platform import when using CUDA (#22765) Signed-off-by: mgoin --- vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/attention/ops/chunked_prefill_paged_decode.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 1ee1dea729..da3d9ff328 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -22,7 +22,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention if TYPE_CHECKING: from vllm.worker.model_runner import 
ModelInputForGPUWithSamplingMetadata @@ -886,6 +885,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): num_seqs, num_heads, head_size = decode_query.shape block_size = value_cache.shape[3] gqa_ratio = num_heads // self.num_kv_heads + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( decode_query.dtype, head_size, block_size, gqa_ratio, decode_meta.max_decode_seq_len, self.sliding_window, diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index dc10d7eca9..e5b90a8b27 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -11,7 +11,6 @@ import torch from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention from vllm.triton_utils import tl, triton from .prefix_prefill import context_attention_fwd @@ -296,6 +295,7 @@ def chunked_prefill_paged_decode( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( query.dtype, head_size, From 77a6bf07aedf132aad2b6719f6d87abc5d3311ab Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:31:47 -0400 Subject: [PATCH 223/932] [Bug] Fix Unexpected Keyword Argument 'w1_bias' (#22757) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/layer.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fb38fb91ea..8ef0a805d8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -475,12 +475,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) else: - return self.fused_experts( + # add w1_bias/w2_bias to kwargs if they exist + kwargs = dict( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_bias=layer.w13_bias if self.has_bias else None, - w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -489,6 +488,17 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): global_num_experts=global_num_experts, expert_map=expert_map, ) + if isinstance(self.fused_experts, + FusedMoEModularKernel) and self.has_bias: + raise ValueError( + "FusedMoEModularKernel does not support bias.") + if self.has_bias: + kwargs.update({ + "w1_bias": getattr(layer, "w13_bias", None), + "w2_bias": getattr(layer, "w2_bias", None), + }) + + return self.fused_experts(**kwargs) def forward_cpu( self, From 4c558cf62ed69fbd8c031809b0a7f8b12afa980b Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Tue, 12 Aug 2025 21:34:47 -0700 Subject: [PATCH 224/932] [Perf] Support topk softmax fused kernel for broader num_experts (#22211) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui --- csrc/moe/topk_softmax_kernels.cu | 77 +++++++++++++++++++------------- tests/kernels/moe/test_moe.py | 2 +- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 7a7865b901..946c137db6 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ 
b/csrc/moe/topk_softmax_kernels.cu @@ -188,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK( It fuses the softmax, max and argmax into a single kernel. Limitations: - 1) This implementation is intended for when the number of experts is a small power of 2. + 1) This implementation is optimized for when the number of experts is a small power of 2. + Additionally it also supports when number of experts is multiple of 64 which is still + faster than the computing softmax and topK separately (only tested on CUDA yet). 2) This implementation assumes k is small, but will work for any k. */ @@ -198,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ int* source_rows, const int k, const int start_expert, const int end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. - static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); - static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); @@ -407,12 +407,10 @@ struct TopkConstants }; } // namespace detail -template +template void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) { - static constexpr std::size_t MAX_BYTES_PER_LDG = 16; - static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); using Constants = detail::TopkConstants; static constexpr int VPT = Constants::VPT; @@ -425,21 +423,12 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); } -#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ - switch (warpSize) { \ - case 32: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - case 64: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \ - } +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \ + "Unsupported warp size. 
Only 32 and 64 are supported."); \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, stream); template void topkGatingSoftmaxKernelLauncher( @@ -453,38 +442,62 @@ void topkGatingSoftmaxKernelLauncher( const int topk, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; - auto warpSize = WARP_SIZE; + static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; + static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8; switch (num_experts) { case 1: - LAUNCH_SOFTMAX(1, WARPS_PER_TB); + LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 2: - LAUNCH_SOFTMAX(2, WARPS_PER_TB); + LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 4: - LAUNCH_SOFTMAX(4, WARPS_PER_TB); + LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 8: - LAUNCH_SOFTMAX(8, WARPS_PER_TB); + LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 16: - LAUNCH_SOFTMAX(16, WARPS_PER_TB); + LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 32: - LAUNCH_SOFTMAX(32, WARPS_PER_TB); + LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 64: - LAUNCH_SOFTMAX(64, WARPS_PER_TB); + LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 128: - LAUNCH_SOFTMAX(128, WARPS_PER_TB); + LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 256: - LAUNCH_SOFTMAX(256, WARPS_PER_TB); + LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; + case 512: + LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + // (CUDA only) support multiples of 64 when num_experts is not power of 2. + // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, + // alternatively we can test 4 bytes loading and enable it in future. 
+#ifndef USE_ROCM + case 192: + LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 320: + LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 384: + LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 448: + LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 576: + LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; +#endif default: { TORCH_CHECK(softmax_workspace != nullptr, - "softmax_workspace must be provided for num_experts that are not a power of 2."); + "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); static constexpr int TPB = 256; moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 0f1c787046..49c097718e 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -36,7 +36,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -NUM_EXPERTS = [8, 64] +NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] TOP_KS = [2, 6] From 6807af8f46acd184f99342ff38f2a1359f693b10 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 12 Aug 2025 21:37:26 -0700 Subject: [PATCH 225/932] [gpt-oss] upgrade gpt-oss to v0.0.3 and add version check (#22768) Signed-off-by: Chen Zhang --- vllm/entrypoints/tool.py | 51 ++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 723cff91d4..758789a5e0 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Optional - -from openai_harmony import Message +from typing import TYPE_CHECKING, Any from vllm.logger import init_logger @@ -15,6 +13,30 @@ if TYPE_CHECKING: logger = init_logger(__name__) +def validate_gpt_oss_install(): + """ + Check if the gpt-oss is installed and its version is at least 0.0.3. + If not, raise an ImportError. + """ + from importlib.metadata import PackageNotFoundError, version + + from packaging.version import InvalidVersion, Version + + try: + pkg_version_str = version("gpt_oss") # e.g., "0.0.5" + pkg_version = Version(pkg_version_str) + except PackageNotFoundError: + raise ImportError("Package 'gpt_oss' is not installed.") from None + except InvalidVersion as e: + raise ImportError( + f"Invalid version string for 'gpt_oss': {e}") from None + + if pkg_version < Version("0.0.3"): + raise ImportError( + f"gpt_oss >= 0.0.3 is required, but {pkg_version} is installed." 
+ ) from None + + class Tool(ABC): @abstractmethod @@ -33,12 +55,14 @@ class HarmonyBrowserTool(Tool): return try: + validate_gpt_oss_install() from gpt_oss.tools.simple_browser import SimpleBrowserTool from gpt_oss.tools.simple_browser.backend import ExaBackend - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, browsing is disabled") + "gpt_oss is not installed properly (%s), browsing is disabled", + e) return browser_backend = ExaBackend(source="web", api_key=exa_api_key) @@ -65,23 +89,16 @@ class HarmonyPythonTool(Tool): self.enabled = True try: + validate_gpt_oss_install() from gpt_oss.tools.python_docker.docker_tool import PythonTool - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, code interpreter is disabled") + "gpt_oss is not installed properly (%s), code interpreter is " + "disabled", e) return - # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response - # and we do the following monkey patch to fix it. - class PatchedGptOssPythonTool(PythonTool): - - def _make_response(self, - output: str, - channel: Optional[str] = None) -> Message: - return super()._make_response(output) - - self.python_tool = PatchedGptOssPythonTool() + self.python_tool = PythonTool() logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: From d16aa3dae446d93f870a2e51b240e18a01cac294 Mon Sep 17 00:00:00 2001 From: zzh142857 Date: Wed, 13 Aug 2025 03:09:13 -0400 Subject: [PATCH 226/932] [Model] Add option to run Step3VisionEncoder in DP (#22697) Signed-off-by: zzh142857 --- vllm/model_executor/models/step3_vl.py | 132 +++++++++++++++++-------- 1 file changed, 91 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 41dba312cb..f1f38c01b7 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -21,6 +21,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -33,6 +34,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Step3VisionEncoderConfig from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -650,7 +652,8 @@ class Step3VisionAttention(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -659,20 +662,42 @@ class Step3VisionAttention(nn.Module): self.scale = self.head_dim**-0.5 - tp_size = get_tensor_model_parallel_world_size() + tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size - self.qkv_proj = 
QKVParallelLinear(self.embed_dim, - self.head_dim, - self.total_num_heads, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.out_proj = RowParallelLinear(self.embed_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=prefix) + + self.q_size = self.num_heads * self.head_dim + + if use_data_parallel: + self.qkv_proj = ReplicatedLinear( + self.embed_dim, + 3 * self.q_size, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = ReplicatedLinear( + self.total_num_heads * self.head_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + else: + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.total_num_heads, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = RowParallelLinear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -712,20 +737,25 @@ class Step3VisionMLP(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.fc2 = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=prefix) + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=prefix) + cls_fc2 = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.fc2 = cls_fc2(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=prefix) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -739,15 +769,22 @@ class Step3VisionEncoderLayer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() + self.use_data_parallel = use_data_parallel self.embed_dim = config.hidden_size - self.self_attn = Step3VisionAttention(config, - quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = Step3VisionAttention( + config, + quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=self.use_data_parallel) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Step3VisionMLP(config, quant_config, prefix=f"{prefix}.mlp") + self.mlp = Step3VisionMLP(config, + quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=self.use_data_parallel) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -767,13 +804,16 @@ class Step3VisionEncoder(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.layers = nn.ModuleList([ Step3VisionEncoderLayer(config, quant_config, - prefix=f"{prefix}.layers.{i}") + prefix=f"{prefix}.layers.{i}", + 
use_data_parallel=self.use_data_parallel) for i in range(config.num_hidden_layers) ]) @@ -792,21 +832,29 @@ class Step3VisionTransformer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.image_size = config.image_size self.embeddings = Step3VisionEmbeddings(config) - self.transformer = Step3VisionEncoder(config, - quant_config, - prefix=f"{prefix}.transformer") + self.transformer = Step3VisionEncoder( + config, + quant_config, + prefix=f"{prefix}.transformer", + use_data_parallel=self.use_data_parallel) def forward( self, pixel_values: torch.Tensor, ): hidden_states = self.embeddings(pixel_values) - hidden_states = self.transformer(inputs_embeds=hidden_states) + if self.use_data_parallel: + hidden_states = run_dp_sharded_vision_model( + hidden_states, self.transformer) + else: + hidden_states = self.transformer(inputs_embeds=hidden_states) return hidden_states @@ -836,13 +884,15 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = (vllm_config.parallel_config. + enable_multimodal_encoder_data_parallel) if multimodal_config.get_limit_per_prompt("image"): - self.vision_model = Step3VisionTransformer(config.vision_config, - None, - prefix=maybe_prefix( - prefix, - "vision_model")) + self.vision_model = Step3VisionTransformer( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel) self.vit_downsampler = nn.Conv2d( config.vision_config.hidden_size, config.vision_config.output_hidden_size, From 9e7e5baaa83b1e5070a3cf3823c134b28eaa2a1c Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Wed, 13 Aug 2025 16:23:33 +0800 Subject: [PATCH 227/932] [Model] Add missing prefix to glm4_1v (#22716) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2a89c03bfe..88c53c8363 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -453,25 +453,30 @@ class Glm4vPatchMerger(nn.Module): context_dim: int, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = d_model self.proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=bias, - gather_output=True) + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.proj") self.post_projection_norm = nn.LayerNorm(self.hidden_size) self.gate_up_proj = MergedColumnParallelLinear( input_size=self.hidden_size, output_sizes=[context_dim] * 2, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.down_proj = RowParallelLinear( context_dim, self.hidden_size, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) self.act_fn = SiluAndMul() self.extra_activation_func = nn.GELU() @@ -661,6 +666,7 @@ class Glm4vVisionTransformer(nn.Module): context_dim=vision_config.intermediate_size, quant_config=quant_config, bias=False, + prefix=f"{prefix}.merger", ) self.embeddings = Glm4vVisionEmbeddings(vision_config) From a01e0018b50fbda6aaf151268fd6f4769b6e81a8 Mon Sep 17 00:00:00 
2001 From: Duc-Viet Hoang Date: Wed, 13 Aug 2025 17:11:36 +0700 Subject: [PATCH 228/932] [Bugfix] Fix Nemotron VL image processing (#22739) Co-authored-by: ducviet00-h2 --- .../multimodal/processing/test_nemotron_vl.py | 8 +- vllm/model_executor/models/nemotron_vl.py | 186 ++++++++++++++++++ 2 files changed, 190 insertions(+), 4 deletions(-) diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 3ce88bc427..6fbbab0d26 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -23,15 +23,15 @@ def _get_expected_num_patches( min_num: int, max_num: int, ): - from vllm.model_executor.models.internvl import ( - calculate_internvl_targets, get_internvl_target_ratios) + from vllm.model_executor.models.nemotron_vl import ( + calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios) width, height = image.size - blocks, _, _ = calculate_internvl_targets( + blocks, _, _ = calculate_nemotron_vl_targets( orig_width=width, orig_height=height, - target_ratios=get_internvl_target_ratios( + target_ratios=get_nemotron_vl_target_ratios( min_num, max_num, ), diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index b90cb9b39a..82bcd06462 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -13,6 +13,7 @@ from typing import Optional import torch import torch.nn as nn +import torchvision.transforms as T from PIL import Image from transformers import AutoModel, PretrainedConfig from transformers.image_processing_utils_fast import BaseImageProcessorFast @@ -27,6 +28,7 @@ from vllm.model_executor.models.internvl import ( from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.processing import PromptUpdateDetails from vllm.sequence import IntermediateTensors @@ -44,6 +46,146 @@ IMG_END = '' IMG_CONTEXT = '' +def build_transform(input_size: int): + return T.Compose([ + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + ]) + + +# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1 +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: + best_factor = float('-inf') + best_ratio = (1, 1) + area = width * height + + for rw, rh in target_ratios: + target_aspect_ratio = rw / rh + size_factor = min((rw * rh * image_size * image_size) / area, 0.6) + ratio_closeness = min(target_aspect_ratio / aspect_ratio, + aspect_ratio / target_aspect_ratio) + factor = size_factor * ratio_closeness + + if factor > best_factor: + best_factor = factor + best_ratio = (rw, rh) + + return best_ratio + + +def calculate_nemotron_vl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + 
image_size=image_size, + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: + blocks += 1 + + return blocks, target_width, target_height + + +def dynamic_preprocess_nemotron_vl( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: + orig_width, orig_height = image.size + + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_nemotron_vl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + + assert len(processed_images) == blocks + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + + return processed_images + + +def get_nemotron_vl_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def image_to_pixel_values_nemotron_vl( + image: Image.Image, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_nemotron_vl_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + + images = dynamic_preprocess_nemotron_vl( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) + return pixel_values + + class NemotronVLProcessor(InternVLProcessor): def __init__( @@ -87,6 +229,50 @@ class NemotronVLProcessor(InternVLProcessor): def image_token_id(self) -> int: return self.tokenizer.get_vocab()[IMG_CONTEXT] + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + + num_patches, _, _ = calculate_nemotron_vl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) + + return num_patches * self.num_image_token + + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_nemotron_vl( + image, + 
input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + def _preprocess_image( self, text: list[str], From 3f52738dce57360ccc92c9993c5adcaaec1f5ac2 Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 13 Aug 2025 19:10:07 +0800 Subject: [PATCH 229/932] [Doc] Add max_lora_rank configuration guide (#22782) Signed-off-by: chiliu --- docs/features/lora.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/features/lora.md b/docs/features/lora.md index a4e05dae11..668460a368 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -351,3 +351,22 @@ vllm serve ibm-granite/granite-speech-3.3-2b \ ``` Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions. + +## Using Tips + +### Configuring `max_lora_rank` + +The `--max-lora-rank` parameter controls the maximum rank allowed for LoRA adapters. This setting affects memory allocation and performance: + +- **Set it to the maximum rank** among all LoRA adapters you plan to use +- **Avoid setting it too high** - using a value much larger than needed wastes memory and can cause performance issues + +For example, if your LoRA adapters have ranks [16, 32, 64], use `--max-lora-rank 64` rather than 256 + +```bash +# Good: matches actual maximum rank +vllm serve model --enable-lora --max-lora-rank 64 + +# Bad: unnecessarily high, wastes memory +vllm serve model --enable-lora --max-lora-rank 256 +``` From d94e3026ded838bc0c3eec9e0a0b4b3affa0cbc9 Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Wed, 13 Aug 2025 04:11:28 -0700 Subject: [PATCH 230/932] [V1] Add tree drafting tests for eagle spec decoding (#22705) Signed-off-by: Giancarlo Delfin --- tests/v1/spec_decode/test_eagle.py | 160 +++++++++++++++++++++++- tests/v1/spec_decode/test_max_len.py | 6 - vllm/v1/attention/backends/tree_attn.py | 6 +- vllm/v1/spec_decode/eagle.py | 61 +++------ 4 files changed, 178 insertions(+), 55 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 2b4f8bd2a8..7b8445a0b2 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional from unittest import mock import pytest @@ -23,7 +24,11 @@ eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" -def _create_proposer(method: str, k: int) -> EagleProposer: +def _create_proposer( + method: str, + num_speculative_tokens: int, + speculative_token_tree: Optional[list[tuple[int]]] = None, +) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100) @@ -31,12 +36,18 @@ def _create_proposer(method: str, k: int) -> EagleProposer: # Choose model directory based on method draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir + spec_token_tree_str = None + if speculative_token_tree is not None: + assert num_speculative_tokens == len(speculative_token_tree) + spec_token_tree_str = str(speculative_token_tree) + speculative_config = SpeculativeConfig( target_model_config=model_config, target_parallel_config=ParallelConfig(), model=draft_model_dir, method=method, - num_speculative_tokens=k, + num_speculative_tokens=num_speculative_tokens, + speculative_token_tree=spec_token_tree_str, ) vllm_config = VllmConfig( @@ -189,7 
+200,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.lm_head = mock.MagicMock() # Create proposer using the helper function - proposer = _create_proposer(method, k=8) + proposer = _create_proposer(method, num_speculative_tokens=8) # Call the method under test proposer.load_model(target_model) @@ -226,6 +237,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): pytest.skip("TRITON_ATTN_VLLM_V1 does not support " "multi-token eagle spec decode on current platform") + if (attn_backend == "TREE_ATTN"): + pytest.skip("TREE_ATTN is tested separately in test_propose_tree" + "because it requires special input mocking.") + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") @@ -378,3 +393,142 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): # Verify all tokens match our expectations assert torch.equal(result, expected_tokens) + + +@pytest.mark.parametrize( + "spec_token_tree", + [ + [(0, )], # A single token + [(0, ), (0, 0), (0, 0, 0)], # Chain + [(0, ), (1, ), (2, )], # Parallel + [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0), + (2, 1)], # Tree + ]) +def test_propose_tree(spec_token_tree): + # Get GPU device. + device = torch.device(current_platform.device_type) + + # Setup test parameters. + batch_size = 2 + seq_len_1 = 5 + seq_len_2 = 3 + total_tokens = seq_len_1 + seq_len_2 + vocab_size = 100 + seq_lens = [seq_len_1, seq_len_2] + num_speculative_tokens = len(spec_token_tree) + + # Create proposer first so we can use its actual hidden_size. + proposer = _create_proposer("eagle", + num_speculative_tokens, + speculative_token_tree=spec_token_tree) + # Get the hidden_size from the proposer to ensure consistency. + hidden_size = proposer.hidden_size + + # Helper to create deterministic logits that will produce specific tokens + def create_deterministic_logits(token_ids, k: int): + logits = torch.full((batch_size, vocab_size), -100.0, device=device) + for i, token_id in enumerate(token_ids): + # Assign decreasing values to the k, consecutive, tokens. + for j in range(k): + logits[i, token_id + j] = 100.0 - j + return logits + + # Mock a model that returns deterministic logits. + base_token_ids = torch.tensor([42, 60], dtype=torch.int64, device=device) + + # Skip loading the model and replace it with a mock that returns + # deterministic outputs. + model_mock = mock.MagicMock() + + # Mock the model forward calls. + forward_returns = [(torch.zeros(total_tokens, hidden_size, device=device), + torch.zeros(total_tokens, hidden_size, device=device))] + for cu_num_drafts in proposer.cu_drafts_per_level: + h_logits = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + h_states = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + forward_returns.append((h_logits, h_states)) + model_mock.side_effect = forward_returns + + # Mock the compute_logits calls. 
+ cu_num_drafts_tensor = torch.tensor([0] + proposer.cu_drafts_per_level, + dtype=torch.int32, + device=device) + logits_returns = [] + for level, num_children in enumerate(proposer.child_drafts_per_level): + token_ids = base_token_ids + cu_num_drafts_tensor[level] + level_num_drafts = cu_num_drafts_tensor[ + level + 1] - cu_num_drafts_tensor[level] + level_logits = [] + for i in range(level_num_drafts // num_children): + level_logits.append( + create_deterministic_logits(token_ids + i * num_children, + num_children)) + logits_returns.append(torch.stack(level_logits, dim=1)) + model_mock.compute_logits.side_effect = logits_returns + + # Assign the mock to the proposer + proposer.model = model_mock + + # Assign draft attn_layer_names since load_model is not invoked + proposer.attn_layer_names = ["layer.0"] + + # Get the tree attention metadata builder. + attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN) + attn_metadata_builder = attn_metadata_builder_cls( + kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), + layer_names=proposer.attn_layer_names, + vllm_config=proposer.vllm_config, + device=device, + ) + + # Mock runner for attention metadata building. + proposer.runner = mock.MagicMock() + proposer.runner.attn_groups.append([mock.MagicMock()]) + proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder + + # Setup inputs for the proposer. + target_token_ids = torch.randint(0, + vocab_size, (total_tokens, ), + device=device) + target_positions = torch.cat([ + torch.arange(seq_len_1, device=device), + torch.arange(seq_len_2, device=device) + ]) + target_hidden_states = torch.randn(total_tokens, + hidden_size, + device=device) + next_token_ids = torch.randint(0, + vocab_size, (batch_size, ), + dtype=torch.int32, + device=device) + batch_spec = BatchSpec( + seq_lens=seq_lens, + query_lens=seq_lens, + ) + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + sampling_metadata = mock.MagicMock() + + # Propose draft tokens. + result = proposer.propose(target_token_ids=target_token_ids, + target_positions=target_positions, + target_hidden_states=target_hidden_states, + next_token_ids=next_token_ids, + common_attn_metadata=common_attn_metadata, + sampling_metadata=sampling_metadata) + assert result.shape == (batch_size, num_speculative_tokens) + + # The tokens are expected to be consecutive integers starting + # from the base token IDs. + expected_tokens = base_token_ids[:, None] + torch.arange( + num_speculative_tokens, dtype=torch.int64, device=device) + + # Verify that the draft tokens match our expectations. 
+ assert torch.equal(result, expected_tokens) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 01019b29e0..a5b10bb518 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -39,12 +39,6 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - - if attn_backend == "TREE_ATTN" and num_speculative_tokens > 1: - # TREE_ATTN fails the test with multi-token spec decode - # TODO: Investigate why - pytest.skip("TREE_ATTN fails the test") - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 3b53b039f1..5d10e9e260 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -236,9 +236,9 @@ class TreeAttentionMetadataBuilder( # Use prefill for drafting at the root level. self.tree_attn_bias = torch.empty(0) else: - # Slice the tree attention bias for drafting. - query_len = common_attn_metadata.max_query_len - start, end = draft_index, draft_index + query_len + # Slice the tree attention bias for drafting. Exclude + # the root level. + start, end = 1, 1 + common_attn_metadata.max_query_len self.tree_attn_bias = self.tree_attn_bias[start:end, start:end].contiguous() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f75d76dd97..a8a160a0f9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -113,13 +113,6 @@ class EagleProposer: num_drafts_per_level[level]) self.child_drafts_per_level.append(num_drafts_per_level[level] // num_drafts_per_level[level - 1]) - # Find the first level where the tree branches off into one or more - # children. - self.first_branching_level = None - for level in range(tree_depth): - if self.cu_drafts_per_level[level] > level + 1: - self.first_branching_level = level - break # Precompute draft position offsets in flattened tree. self.tree_draft_pos_offsets = torch.arange( 1, @@ -209,11 +202,10 @@ class EagleProposer: logits = self.model.compute_logits(sample_hidden_states, None) positions = target_positions[last_token_indices] hidden_states = hidden_states[last_token_indices] - if self.first_branching_level == 0: - # Branching has occurred at the root level. Draft using tree - # attention. + + if isinstance(attn_metadata, TreeAttentionMetadata): + # Draft using tree attention. draft_token_ids_list = self.propose_tree( - tree_root_level=0, batch_size=batch_size, logits=logits, positions=positions, @@ -242,11 +234,10 @@ class EagleProposer: (TritonAttentionMetadata, AiterFlashAttentionMetadata, FlashAttentionMetadata)) else: - # Currently, only FlashAttention and TreeAttention support - # multi-token eagle spec decode. This is because the code below - # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, - (FlashAttentionMetadata, TreeAttentionMetadata)) + # Currently, only FlashAttention supports multi-token eagle spec + # decode. This is because the code below makes assumptions about + # attn_metadata attributes available. + assert isinstance(attn_metadata, FlashAttentionMetadata) # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] @@ -259,7 +250,7 @@ class EagleProposer: attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for token_index in range(self.num_speculative_tokens - 1): + for _ in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -327,21 +318,6 @@ class EagleProposer: hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], None) - - if self.first_branching_level == token_index + 1: - # Branching has occurred. The remaining tokens are drafted - # using tree attention. - draft_token_ids_list += self.propose_tree( - tree_root_level=token_index + 1, - batch_size=batch_size, - logits=logits, - positions=positions, - hidden_states=hidden_states, - common_attn_metadata=common_attn_metadata, - ) - # [batch_size, num_tree_tokens] - return torch.cat(draft_token_ids_list, dim=1) - draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -351,7 +327,6 @@ class EagleProposer: def propose_tree( self, - tree_root_level: int, batch_size: int, # [num_tokens, vocab_size] logits: torch.Tensor, @@ -366,10 +341,10 @@ class EagleProposer: assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) - total_num_drafts = self.cu_drafts_per_level[tree_root_level] + total_num_drafts = self.cu_drafts_per_level[0] level_num_drafts = total_num_drafts # Sample a draft token for each child at the tree root level. - num_children = self.child_drafts_per_level[tree_root_level] + num_children = self.child_drafts_per_level[0] if num_children == 1: draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1) else: @@ -393,22 +368,23 @@ class EagleProposer: positions.view(batch_size, -1) + self.tree_draft_pos_offsets[:batch_size, :]) tree_depth = len(self.cu_drafts_per_level) - for level in range(tree_root_level, tree_depth - 1): + for level in range(tree_depth - 1): # Get draft positions for RoPE. draft_positions = positions + (level + 1) exceeds_max_model_len = (positions + total_num_drafts) >= self.max_model_len # Mask out the position ids that exceed the max model length. # Otherwise, we may get out-of-range error in RoPE. - clamped_draft_positions = torch.where( + draft_positions = torch.where( exceeds_max_model_len, 0, draft_positions, - ) + ).view(batch_size, -1) + if level_num_drafts > 1: # Repeat the positions for each draft at this level. - draft_positions = clamped_draft_positions.repeat_interleave( - level_num_drafts).reshape(batch_size, -1) + draft_positions = draft_positions.repeat_interleave( + level_num_drafts, dim=1) if num_children > 1: # Repeat draft hidden states for each child. @@ -425,7 +401,7 @@ class EagleProposer: # Build new attention metadata for the next level of drafts. # This is necessary to support tree attention. - query_len = total_num_drafts - tree_root_level + query_len = total_num_drafts common_attn_metadata = replace( common_attn_metadata, query_start_loc=query_len * self.arange[:batch_size + 1], @@ -435,7 +411,7 @@ class EagleProposer: ) attn_metadata = tree_attn_metadata_builder.build_for_drafting( common_attn_metadata=common_attn_metadata, - draft_index=tree_root_level + 1, + draft_index=level + 1, ) # Apply new attention metadata to all layers. 
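The drafting loop above depends on two per-level tables that the proposer precomputes from the speculative token tree: `cu_drafts_per_level` (the cumulative number of drafts up to each tree level) and `child_drafts_per_level` (how many children each draft expands into at the next level). A minimal sketch of that bookkeeping, assuming the tree is encoded as a list of child-index tuples as in `test_propose_tree` above; this is an illustrative sketch, not the vLLM implementation:

```python
from itertools import accumulate


def tree_level_stats(
        spec_token_tree: list[tuple[int, ...]]) -> tuple[list[int], list[int]]:
    """Derive per-level draft counts from a speculative token tree."""
    depth = max(len(node) for node in spec_token_tree)
    drafts_per_level = [0] * depth
    for node in spec_token_tree:
        drafts_per_level[len(node) - 1] += 1
    # Cumulative drafts per level, e.g. [3, 9] for the 9-node tree in the test.
    cu_drafts_per_level = list(accumulate(drafts_per_level))
    # Children spawned by each draft at the next level, e.g. [3, 2].
    child_drafts_per_level = [drafts_per_level[0]]
    for level in range(1, depth):
        child_drafts_per_level.append(drafts_per_level[level] //
                                      drafts_per_level[level - 1])
    return cu_drafts_per_level, child_drafts_per_level
```

For a plain chain such as `[(0, ), (0, 0), (0, 0, 0)]` this reduces to `[1, 2, 3]` and `[1, 1, 1]`, which matches the one-token-per-step decode path that the non-tree branch above still handles.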
@@ -516,7 +492,6 @@ class EagleProposer: level_num_drafts = self.cu_drafts_per_level[level + 1] - total_num_drafts total_num_drafts = self.cu_drafts_per_level[level + 1] - return draft_token_ids_list def prepare_inputs( From 0b1bdac6af33b890a4d68321df05e71a1ba43dc4 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 13 Aug 2025 19:12:00 +0800 Subject: [PATCH 231/932] [Platform] Custom ops support for FusedMoe (#22509) Signed-off-by: wangxiyuan --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- vllm/model_executor/layers/linear.py | 12 ++++++------ .../layers/vocab_parallel_embedding.py | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8ef0a805d8..ddc02168e5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -682,7 +682,8 @@ def determine_expert_map( return (local_num_experts, expert_map) -class FusedMoE(torch.nn.Module): +@CustomOp.register("fused_moe") +class FusedMoE(CustomOp): """FusedMoE layer for MoE models. This layer contains both MergedColumnParallel weights (gate_up_proj / diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index bb81a663d4..75391c51f7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -16,6 +16,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -226,7 +227,7 @@ class UnquantizedLinearMethod(LinearMethodBase): return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) -class LinearBase(torch.nn.Module): +class LinearBase(CustomOp): """Base linear layer. Args: @@ -269,12 +270,8 @@ class LinearBase(torch.nn.Module): prefix=prefix) self.return_bias = return_bias - def forward( - self, x: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: - raise NotImplementedError - +@CustomOp.register("replicated_linear") class ReplicatedLinear(LinearBase): """Replicated linear layer. @@ -443,6 +440,7 @@ class MergedReplicatedLinear(ReplicatedLinear): param[shard_offset:shard_offset + shard_size] = loaded_weight +@CustomOp.register("column_parallel_linear") class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. @@ -1229,6 +1227,7 @@ class QKVParallelLinear(ColumnParallelLinear): param_data.copy_(loaded_weight) +@CustomOp.register("row_parallel_linear") class RowParallelLinear(LinearBase): """Linear layer with row parallelism. @@ -1405,6 +1404,7 @@ class RowParallelLinear(LinearBase): return s +@CustomOp.register("qkv_cross_parallel_linear") class QKVCrossParallelLinear(LinearBase): """Linear layers for efficient cross-attention's QKV transformation. 
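The changes above register `FusedMoE` and the linear layers (and, just below, `VocabParallelEmbedding`) as `CustomOp` subclasses keyed by string names, which is the hook that lets a platform plugin swap in its own implementation of these layers. A rough sketch of the general registry pattern this enables, using hypothetical names rather than the real `CustomOp` API:

```python
# Hypothetical sketch of a name-keyed op registry; the actual
# vllm.model_executor.custom_op.CustomOp API may differ.
import torch
import torch.nn as nn


class SimpleCustomOp(nn.Module):
    registry: dict[str, type["SimpleCustomOp"]] = {}

    @classmethod
    def register(cls, name: str):
        def decorator(op_cls):
            cls.registry[name] = op_cls
            return op_cls

        return decorator

    def forward(self, *args, **kwargs):
        # A platform plugin could dispatch on the registered name here and
        # call a device-specific kernel instead of the native path.
        return self.forward_native(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        raise NotImplementedError


@SimpleCustomOp.register("replicated_linear")
class ToyReplicatedLinear(SimpleCustomOp):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(out_features, in_features))

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.linear(x, self.weight)
```

Because each layer is looked up by name (for example `"fused_moe"` or `"row_parallel_linear"`), an out-of-tree backend can override specific layers without importing or patching model code.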
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index a5f262c832..9f223998e5 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -159,7 +160,8 @@ def get_masked_input_and_mask( return input_, ~vocab_mask -class VocabParallelEmbedding(torch.nn.Module): +@CustomOp.register("vocab_parallel_embedding") +class VocabParallelEmbedding(CustomOp): """Embedding parallelized in the vocabulary dimension. Adapted from torch.nn.Embedding, note that we pad the vocabulary size to From 653124bd46c57770b151eb58cc2a59170753daa5 Mon Sep 17 00:00:00 2001 From: Kdump Date: Wed, 13 Aug 2025 19:14:24 +0800 Subject: [PATCH 232/932] [Frontend] Add chunked processing to handle long inputs in embedding models (#22280) Signed-off-by: x22x22 Signed-off-by: Kdump Signed-off-by: DarkLight1337 Co-authored-by: Cyrus Leung Co-authored-by: Maximilien de Bayser Co-authored-by: DarkLight1337 --- .../openai_embedding_long_text/README.md | 186 +++++++ .../openai_embedding_long_text/client.py | 366 ++++++++++++++ .../openai_embedding_long_text/service.sh | 137 ++++++ .../openai/test_embedding_long_text.py | 441 +++++++++++++++++ vllm/config/__init__.py | 19 + vllm/entrypoints/openai/serving_embedding.py | 457 +++++++++++++++++- 6 files changed, 1603 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/openai_embedding_long_text/README.md create mode 100644 examples/online_serving/openai_embedding_long_text/client.py create mode 100644 examples/online_serving/openai_embedding_long_text/service.sh create mode 100644 tests/entrypoints/openai/test_embedding_long_text.py diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/online_serving/openai_embedding_long_text/README.md new file mode 100644 index 0000000000..04edc4680e --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/README.md @@ -0,0 +1,186 @@ +# Long Text Embedding with Chunked Processing + +This directory contains examples for using vLLM's **chunked processing** feature to handle long text embedding that exceeds the model's maximum context length. 
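The idea behind chunked processing is straightforward: an input that exceeds the model's native window is split into window-sized chunks, each chunk is embedded with the model's own pooling, and the per-chunk vectors are combined with a token-count-weighted mean. A minimal illustrative sketch of that aggregation (plain PyTorch, not the vLLM internals; `embed_chunk` stands in for whatever produces a single chunk embedding):

```python
import torch


def embed_long_input(token_ids: list[int], max_chunk_size: int,
                     embed_chunk) -> torch.Tensor:
    # Split into chunks that fit the model's native window.
    chunks = [
        token_ids[i:i + max_chunk_size]
        for i in range(0, len(token_ids), max_chunk_size)
    ]
    # Embed each chunk independently, then weight by chunk length.
    chunk_embs = torch.stack([embed_chunk(chunk) for chunk in chunks])
    weights = torch.tensor([len(chunk) for chunk in chunks],
                           dtype=chunk_embs.dtype)
    pooled = (chunk_embs * weights[:, None]).sum(dim=0) / weights.sum()
    # Final L2 normalization mirrors the `"normalize": true` pooler setting.
    return torch.nn.functional.normalize(pooled, dim=0)
```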
+ +## 🚀 Quick Start + +### Start the Server + +Use the provided script to start a vLLM server with chunked processing enabled: + +```bash +# Basic usage (supports very long texts up to ~3M tokens) +./service.sh + +# Custom configuration with different models +MODEL_NAME="jinaai/jina-embeddings-v3" \ +MAX_EMBED_LEN=1048576 \ +./service.sh + +# For extremely long documents +MODEL_NAME="intfloat/multilingual-e5-large" \ +MAX_EMBED_LEN=3072000 \ +./service.sh +``` + +### Test Long Text Embedding + +Run the comprehensive test client: + +```bash +python client.py +``` + +## 📁 Files + +| File | Description | +|------|-------------| +| `service.sh` | Server startup script with chunked processing enabled | +| `client.py` | Comprehensive test client for long text embedding | + +## ⚙️ Configuration + +### Server Configuration + +The key parameters for chunked processing are in the `--override-pooler-config`: + +```json +{ + "pooling_type": "auto", + "normalize": true, + "enable_chunked_processing": true, + "max_embed_len": 3072000 +} +``` + +!!! note + `pooling_type` sets the model's own pooling strategy for processing within each chunk. The cross-chunk aggregation automatically uses MEAN strategy when input exceeds the model's native maximum length. + +#### Chunked Processing Behavior + +Chunked processing uses **MEAN aggregation** for cross-chunk combination when input exceeds the model's native maximum length: + +| Component | Behavior | Description | +|-----------|----------|-------------| +| **Within chunks** | Model's native pooling | Uses the model's configured pooling strategy | +| **Cross-chunk aggregation** | Always MEAN | Weighted averaging based on chunk token counts | +| **Performance** | Optimal | All chunks processed for complete semantic coverage | + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `MODEL_NAME` | `intfloat/multilingual-e5-large` | Embedding model to use (supports multiple models) | +| `PORT` | `31090` | Server port | +| `GPU_COUNT` | `1` | Number of GPUs to use | +| `MAX_EMBED_LEN` | `3072000` | Maximum embedding input length (supports very long documents) | +| `POOLING_TYPE` | `auto` | Model's native pooling type: `auto`, `MEAN`, `CLS`, `LAST` (only affects within-chunk pooling, not cross-chunk aggregation) | +| `API_KEY` | `EMPTY` | API key for authentication | + +## 🔧 How It Works + +1. **Enhanced Input Validation**: `max_embed_len` allows accepting inputs longer than `max_model_len` without environment variables +2. **Smart Chunking**: Text is split based on `max_position_embeddings` to maintain semantic integrity +3. **Unified Processing**: All chunks processed separately through the model using its configured pooling strategy +4. **MEAN Aggregation**: When input exceeds model's native length, results combined using token count-based weighted averaging across all chunks +5. 
**Consistent Output**: Final embeddings maintain the same dimensionality as standard processing + +### Input Length Handling + +- **Within max_embed_len**: Input is accepted and processed (up to 3M+ tokens) +- **Exceeds max_position_embeddings**: Chunked processing is automatically triggered +- **Exceeds max_embed_len**: Input is rejected with clear error message +- **No environment variables required**: Works without `VLLM_ALLOW_LONG_MAX_MODEL_LEN` + +### Extreme Long Text Support + +With `MAX_EMBED_LEN=3072000`, you can process: + +- **Academic papers**: Full research papers with references +- **Legal documents**: Complete contracts and legal texts +- **Books**: Entire chapters or small books +- **Code repositories**: Large codebases and documentation + +## 📊 Performance Characteristics + +### Chunked Processing Performance + +| Aspect | Behavior | Performance | +|--------|----------|-------------| +| **Chunk Processing** | All chunks processed with native pooling | Consistent with input length | +| **Cross-chunk Aggregation** | MEAN weighted averaging | Minimal overhead | +| **Memory Usage** | Proportional to number of chunks | Moderate, scalable | +| **Semantic Quality** | Complete text coverage | Optimal for long documents | + +## 🧪 Test Cases + +The test client demonstrates: + +- ✅ **Short text**: Normal processing (baseline) +- ✅ **Medium text**: Single chunk processing +- ✅ **Long text**: Multi-chunk processing with aggregation +- ✅ **Very long text**: Many chunks processing +- ✅ **Extreme long text**: Document-level processing (100K+ tokens) +- ✅ **Batch processing**: Mixed-length inputs in one request +- ✅ **Consistency**: Reproducible results across runs + +## 🐛 Troubleshooting + +### Common Issues + +1. **Chunked processing not enabled**: + + ```log + ValueError: This model's maximum position embeddings length is 4096 tokens... + ``` + + **Solution**: Ensure `enable_chunked_processing: true` in pooler config + +2. **Input exceeds max_embed_len**: + + ```log + ValueError: This model's maximum embedding input length is 3072000 tokens... + ``` + + **Solution**: Increase `max_embed_len` in pooler config or reduce input length + +3. **Memory errors**: + + ```log + RuntimeError: CUDA out of memory + ``` + + **Solution**: Reduce chunk size by adjusting model's `max_position_embeddings` or use fewer GPUs + +4. **Slow processing**: + **Expected**: Long text takes more time due to multiple inference calls + +### Debug Information + +Server logs show chunked processing activity: + +```log +INFO: Input length 150000 exceeds max_position_embeddings 4096, will use chunked processing +INFO: Split input of 150000 tokens into 37 chunks (max_chunk_size: 4096) +``` + +## 🤝 Contributing + +To extend chunked processing support to other embedding models: + +1. Check model compatibility with the pooling architecture +2. Test with various text lengths +3. Validate embedding quality compared to single-chunk processing +4. 
Submit PR with test cases and documentation updates + +## 🆕 Enhanced Features + +### max_embed_len Parameter + +The new `max_embed_len` parameter provides: + +- **Simplified Configuration**: No need for `VLLM_ALLOW_LONG_MAX_MODEL_LEN` environment variable +- **Flexible Input Validation**: Accept inputs longer than `max_model_len` up to `max_embed_len` +- **Extreme Length Support**: Process documents with millions of tokens +- **Clear Error Messages**: Better feedback when inputs exceed limits +- **Backward Compatibility**: Existing configurations continue to work diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/online_serving/openai_embedding_long_text/client.py new file mode 100644 index 0000000000..6e9838ac6d --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/client.py @@ -0,0 +1,366 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example script demonstrating long text embedding with chunked processing in vLLM. + +This example shows how to use vLLM's chunked processing feature to handle text +inputs that exceed the model's maximum token length. The feature automatically +splits long text into chunks and handles different pooling types optimally. + +Prerequisites: +1. Start vLLM server with chunked processing enabled: + + # MEAN pooling (processes all chunks, recommended for complete coverage) + vllm serve intfloat/multilingual-e5-large \ + --override-pooler-config \ + '{"pooling_type": "MEAN", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ + --served-model-name multilingual-e5-large \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + + # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) + vllm serve BAAI/bge-large-en-v1.5 \ + --override-pooler-config \ + '{"pooling_type": "CLS", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ + --served-model-name bge-large-en-v1.5 \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + +2. Install required dependencies: + pip install openai requests +""" + +import time + +import numpy as np +from openai import OpenAI + +# Configuration +API_KEY = "your-api-key" # Replace with your actual API key +BASE_URL = "http://localhost:31090/v1" +MODEL_NAME = "multilingual-e5-large" + + +def generate_long_text(base_text: str, repeat_count: int) -> str: + """Generate long text by repeating base text.""" + return base_text * repeat_count + + +def test_embedding_with_different_lengths(): + """Test embedding generation with different text lengths.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + # Test cases with different text lengths + test_cases = [ + { + "name": "Short Text", + "text": "Hello, this is a short text for embedding.", + "expected_chunks": 1, + }, + { + "name": "Medium Text", + "text": generate_long_text( + "This is a medium-length text that should fit within the " + "model's context window. " * 20, + 2, + ), + "expected_chunks": 1, + }, + { + "name": "Long Text (2 chunks)", + "text": generate_long_text( + "This is a very long text that will exceed the model's " + "maximum context length and trigger chunked processing. " * 50, + 5, + ), + "expected_chunks": 2, + }, + { + "name": "Very Long Text (3+ chunks)", + "text": generate_long_text( + "This text is extremely long and will definitely " + "require multiple chunks for processing. 
" * 100, + 10, + ), + "expected_chunks": 3, + }, + ] + + print("🧪 Testing vLLM Long Text Embedding with Chunked Processing") + print("=" * 70) + + for i, test_case in enumerate(test_cases, 1): + print(f"\n📝 Test {i}: {test_case['name']}") + print(f"Text length: {len(test_case['text'])} characters") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=test_case["text"], model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + # Extract embedding data + embedding = response.data[0].embedding + embedding_dim = len(embedding) + + print("✅ Success!") + print(f" - Embedding dimension: {embedding_dim}") + print(f" - Processing time: {processing_time:.2f}s") + print(f" - Expected chunks: ~{test_case['expected_chunks']}") + print(f" - First 5 values: {embedding[:5]}") + + except Exception as e: + print(f"❌ Failed: {str(e)}") + + +def test_batch_embedding(): + """Test batch embedding with mixed-length inputs.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔄 Testing Batch Embedding with Mixed Lengths") + print("=" * 50) + + # Mix of short and long texts + batch_inputs = [ + "Short text 1", + generate_long_text("Medium length text that fits in one chunk. " * 20, 1), + "Another short text", + generate_long_text("Long text requiring chunked processing. " * 100, 5), + ] + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("✅ Batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + print( + f" - Average time per input: {processing_time / len(batch_inputs):.2f}s" + ) + + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D embedding" + ) + + except Exception as e: + print(f"❌ Batch processing failed: {str(e)}") + + +def test_multiple_long_texts_batch(): + """Test batch processing with multiple long texts to verify chunk ID uniqueness.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔧 Testing Multiple Long Texts in Batch (Chunk ID Fix Verification)") + print("=" * 70) + + # Create multiple distinct long texts that will all require chunking + # Note: All pooling types now use MEAN aggregation across chunks: + # - Native pooling (MEAN/CLS/LAST) is used within each chunk + # - MEAN aggregation combines results across all chunks + # - Full semantic coverage for all pooling types + long_texts = [ + generate_long_text( + "First long document about artificial intelligence and machine learning. " + * 80, + 6, + ), + generate_long_text( + "Second long document about natural language processing and transformers. " + * 80, + 6, + ), + generate_long_text( + "Third long document about computer vision and neural networks. 
" * 80, 6 + ), + ] + + # Add some short texts to mix things up + batch_inputs = [ + "Short text before long texts", + long_texts[0], + "Short text between long texts", + long_texts[1], + long_texts[2], + "Short text after long texts", + ] + + print("📊 Batch composition:") + for i, text in enumerate(batch_inputs): + length = len(text) + text_type = "Long (will be chunked)" if length > 5000 else "Short" + print(f" - Input {i + 1}: {length} chars ({text_type})") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("\n✅ Multiple long texts batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings returned: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + + # Verify each embedding is different (no incorrect aggregation) + embeddings = [data.embedding for data in response.data] + + if len(embeddings) >= 3: + import numpy as np + + # Compare embeddings of the long texts (indices 1, 3, 4) + long_embeddings = [ + np.array(embeddings[1]), # First long text + np.array(embeddings[3]), # Second long text + np.array(embeddings[4]), # Third long text + ] + + print("\n🔍 Verifying embedding uniqueness:") + for i in range(len(long_embeddings)): + for j in range(i + 1, len(long_embeddings)): + cosine_sim = np.dot(long_embeddings[i], long_embeddings[j]) / ( + np.linalg.norm(long_embeddings[i]) + * np.linalg.norm(long_embeddings[j]) + ) + print( + f" - Similarity between long text {i + 1} and {j + 1}: " + f"{cosine_sim:.4f}" + ) + + if ( + cosine_sim < 0.9 + ): # Different content should have lower similarity + print(" ✅ Good: Embeddings are appropriately different") + else: + print( + " ⚠️ High similarity - may indicate chunk " + "aggregation issue" + ) + + print("\n📋 Per-input results:") + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + embedding_norm = np.linalg.norm(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D " + f"embedding (norm: {embedding_norm:.4f})" + ) + + print( + "\n✅ This test verifies the fix for chunk ID collisions in " + "batch processing" + ) + print(" - Before fix: Multiple long texts would have conflicting chunk IDs") + print(" - After fix: Each prompt's chunks have unique IDs with prompt index") + + except Exception as e: + print(f"❌ Multiple long texts batch test failed: {str(e)}") + print(" This might indicate the chunk ID collision bug is present!") + + +def test_embedding_consistency(): + """Test that chunked processing produces consistent results.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔍 Testing Embedding Consistency") + print("=" * 40) + + # Use the same long text multiple times + long_text = generate_long_text( + "Consistency test text for chunked processing validation. 
" * 50, 3 + ) + + embeddings = [] + + try: + for i in range(3): + response = client.embeddings.create( + input=long_text, model=MODEL_NAME, encoding_format="float" + ) + embeddings.append(response.data[0].embedding) + print(f" - Generated embedding {i + 1}") + + # Check consistency (embeddings should be identical) + if len(embeddings) >= 2: + # Calculate similarity between first two embeddings + + emb1 = np.array(embeddings[0]) + emb2 = np.array(embeddings[1]) + + # Cosine similarity + cosine_sim = np.dot(emb1, emb2) / ( + np.linalg.norm(emb1) * np.linalg.norm(emb2) + ) + + print("✅ Consistency test completed!") + print(f" - Cosine similarity between runs: {cosine_sim:.6f}") + print(" - Expected: ~1.0 (identical embeddings)") + + if cosine_sim > 0.999: + print(" - ✅ High consistency achieved!") + else: + print(" - ⚠️ Consistency may vary due to numerical precision") + + except Exception as e: + print(f"❌ Consistency test failed: {str(e)}") + + +def main(): + """Main function to run all tests.""" + print("🚀 vLLM Long Text Embedding Client") + print(f"📡 Connecting to: {BASE_URL}") + print(f"🤖 Model: {MODEL_NAME}") + masked_key = "*" * (len(API_KEY) - 4) + API_KEY[-4:] if len(API_KEY) > 4 else "****" + print(f"🔑 API Key: {masked_key}") + + # Run all test cases + test_embedding_with_different_lengths() + test_batch_embedding() + test_multiple_long_texts_batch() + test_embedding_consistency() + + print("\n" + "=" * 70) + print("🎉 All tests completed!") + print("\n💡 Key Features Demonstrated:") + print(" - ✅ Automatic chunked processing for long text") + print(" - ✅ Seamless handling of mixed-length batches") + print(" - ✅ Multiple long texts in single batch (chunk ID fix)") + print(" - ✅ Unified chunked processing:") + print(" • Native pooling used within each chunk") + print(" • MEAN aggregation across all chunks") + print(" • Complete semantic coverage for all pooling types") + print(" - ✅ Consistent embedding generation") + print(" - ✅ Backward compatibility with short text") + print("\n📚 For more information, see:") + print( + " - Documentation: https://docs.vllm.ai/en/latest/models/pooling_models.html" + ) + print(" - Chunked Processing Guide: openai_embedding_long_text.md") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh new file mode 100644 index 0000000000..f356d7d452 --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# vLLM Embedding Server with Enhanced Chunked Processing +# This script starts a vLLM server with chunked processing enabled for long text embedding. +# Now supports proper pooling type validation and model-specific configurations. 
+ +set -euo pipefail + +# Configuration +MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"} +MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"} + +PORT=${PORT:-31090} +GPU_COUNT=${GPU_COUNT:-1} +MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000} +API_KEY=${API_KEY:-"your-api-key"} + +# Enhanced pooling configuration with model-specific defaults +POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST +export VLLM_ENABLE_CHUNKED_PROCESSING=true +export CUDA_VISIBLE_DEVICES=2,3,4,5 +# export VLLM_ATTENTION_BACKEND=XFORMERS + +echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing" +echo "==================================================================" + +# Environment variables for optimization +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +# Function to determine optimal pooling type for known models +get_optimal_pooling_type() { + local model="$1" + case "$model" in + *"e5-"* | *"multilingual-e5"*) + echo "MEAN" # E5 series native pooling + ;; + *"bge-"*) + echo "CLS" # BGE series native pooling + ;; + *"gte-"*) + echo "LAST" # GTE series native pooling + ;; + *"sentence-t5"* | *"st5"*) + echo "MEAN" # Sentence-T5 native pooling + ;; + *"jina-embeddings"*) + echo "MEAN" # Jina embeddings native pooling + ;; + *"Qwen"*"Embedding"*) + echo "LAST" # Qwen embeddings native pooling + ;; + *) + echo "MEAN" # Default native pooling for unknown models + ;; + esac +} + +# Auto-detect pooling type if not explicitly set +if [ "$POOLING_TYPE" = "auto" ]; then + POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME") + echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME" +fi + +# Display configuration +echo "📋 Configuration:" +echo " - Model: $MODEL_NAME" +echo " - Port: $PORT" +echo " - GPU Count: $GPU_COUNT" +echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}" +echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens" +echo " - Native Pooling Type: $POOLING_TYPE + Normalization" +echo " - Cross-chunk Aggregation: MEAN (automatic)" +echo "" + +# Validate GPU availability +if command -v nvidia-smi &> /dev/null; then + gpu_count=$(nvidia-smi --list-gpus | wc -l) + echo "🖥️ Available GPUs: $gpu_count" + if [ "$GPU_COUNT" -gt "$gpu_count" ]; then + echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available" + echo " Adjusting to use $gpu_count GPUs" + GPU_COUNT=$gpu_count + fi +else + echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped." +fi + +# Chunked processing uses unified MEAN aggregation +echo "ℹ️ Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks" +echo " - All chunks processed for complete semantic coverage" +echo " - Weighted averaging based on chunk token counts" + +echo "" +echo "🔧 Starting server with enhanced chunked processing configuration..." + +# Build pooler config JSON +POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" + +# Start vLLM server with enhanced chunked processing +vllm serve "$MODEL_NAME" \ + --tensor-parallel-size "$GPU_COUNT" \ + --enforce-eager \ + --override-pooler-config "$POOLER_CONFIG" \ + --served-model-name ${MODEL_CODE} \ + --api-key "$API_KEY" \ + --trust-remote-code \ + --port "$PORT" \ + --host 0.0.0.0 + +echo "" +echo "✅ vLLM Embedding Server started successfully!" 
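+# For reference, with the defaults above POOLER_CONFIG expands to roughly:
+#   {"pooling_type": "MEAN", "normalize": true, "enable_chunked_processing": true, "max_embed_len": 3072000}
+# A quick smoke test of the running server (a sketch; substitute your own
+# host, port, API key and served model name):
+#   curl "http://localhost:${PORT}/v1/embeddings" \
+#     -H "Content-Type: application/json" \
+#     -H "Authorization: Bearer ${API_KEY}" \
+#     -d "{\"model\": \"${MODEL_CODE}\", \"input\": \"hello world\"}"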
+echo "" +echo "📡 Server Information:" +echo " - Base URL: http://localhost:$PORT" +echo " - Model Code: ${MODEL_CODE}" +echo " - API Key: $API_KEY" +echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" +echo "" +echo "🧪 Test the server with:" +echo " python examples/online_serving/openai_embedding_long_text_client.py" +echo "" +echo "📚 Enhanced features enabled:" +echo " ✅ Intelligent native pooling type detection" +echo " ✅ Unified MEAN aggregation for chunked processing" +echo " ✅ Model-specific native pooling optimization" +echo " ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)" +echo " ✅ Complete semantic coverage for all pooling types" +echo " ✅ OpenAI-compatible API" +echo " ✅ GPU acceleration" +echo "" +echo "🔧 Advanced usage:" +echo " - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection" +echo " - Set MAX_EMBED_LEN to adjust maximum input length" +echo " - All pooling types use MEAN aggregation across chunks" diff --git a/tests/entrypoints/openai/test_embedding_long_text.py b/tests/entrypoints/openai/test_embedding_long_text.py new file mode 100644 index 0000000000..86bd34abb9 --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_long_text.py @@ -0,0 +1,441 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test cases for long text embedding with automatic chunking mechanism. + +This test suite validates vLLM's automatic chunking functionality for handling +text inputs that exceed the model's maximum token length, specifically targeting +the intfloat/multilingual-e5-small model (max token length: 512). +""" + +import random + +import openai +import pytest +import pytest_asyncio + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...utils import RemoteOpenAIServer + + +def _generate_random_text(word_count: int) -> str: + """Generate random text with approximately the specified word count.""" + # Common English words with focus on verbs and nouns for realistic text + common_words = [ + # Essential articles and pronouns (minimal) + "the", + "and", + "you", + "they", + "this", + "that", + "these", + "those", + + # Action verbs + "create", + "build", + "develop", + "design", + "implement", + "execute", + "analyze", + "process", + "generate", + "calculate", + "evaluate", + "optimize", + "transform", + "integrate", + "configure", + "deploy", + "monitor", + "manage", + "discover", + "explore", + "investigate", + "research", + "study", + "examine", + "improve", + "enhance", + "upgrade", + "modify", + "update", + "maintain", + "solve", + "resolve", + "handle", + "address", + "tackle", + "overcome", + "communicate", + "collaborate", + "coordinate", + "organize", + "plan", + "achieve", + "accomplish", + "complete", + "finish", + "deliver", + "provide", + + # Technology and science nouns + "system", + "application", + "software", + "hardware", + "network", + "database", + "algorithm", + "model", + "framework", + "platform", + "interface", + "protocol", + "architecture", + "infrastructure", + "component", + "module", + "service", + "technology", + "innovation", + "solution", + "methodology", + "approach", + "artificial", + "intelligence", + "machine", + "learning", + "neural", + "network", + "computer", + "processor", + "memory", + "storage", + "computation", + "data", + "information", + "knowledge", + "insight", + "pattern", + "trend", + "analysis", + "research", + "development", + "engineering", + "science", + "mathematics", + "statistics", + "probability", + "optimization", 
+ "performance", + "efficiency", + + # General nouns + "project", + "team", + "organization", + "company", + "business", + "industry", + "market", + "customer", + "user", + "client", + "product", + "feature", + "function", + "requirement", + "specification", + "documentation", + "report", + "result", + "outcome", + "impact", + "benefit", + "advantage", + "challenge", + "problem", + "opportunity", + "strategy", + "goal", + "objective", + "target", + "milestone", + "process", + "procedure", + "workflow", + "pipeline", + "operation", + "task", + "activity", + "event", + "session", + "meeting", + "discussion", + "decision" + ] + + words = [] + for _ in range(word_count): + words.append(random.choice(common_words)) + + # Add some punctuation for more realistic text + text = " ".join(words) + # Add periods every 10-20 words + words_list = text.split() + result = [] + for i, word in enumerate(words_list): + result.append(word) + if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1): + result[-1] += "." + + return " ".join(result) + + +MODEL_NAME = "intfloat/multilingual-e5-small" +DTYPE = "bfloat16" + +# Test text: Generate text with approximately 1500 words to exceed 1024 tokens +LONG_TEXT_1500_WORDS = _generate_random_text(1500) + +# Test text: Generate text with approximately 2500 words to exceed 2048 tokens +LONG_TEXT_2500_WORDS = _generate_random_text(2500) + + +@pytest.fixture(scope="module") +def server_with_chunked_processing(): + """Start server with automatic chunking processing enabled.""" + args = [ + "--runner", + "pooling", + "--dtype", + DTYPE, + "--enforce-eager", + "--max-model-len", + "512", # Set smaller max_model_len to trigger chunking mechanism + '--override-pooler-config', + ('{"pooling_type": "MEAN", "normalize": true, ' + '"enable_chunked_processing": true, "max_embed_len": 10000}'), + "--gpu-memory-utilization", + "0.8", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client_with_chunked_processing(server_with_chunked_processing): + """Create async client with chunking processing support.""" + async with server_with_chunked_processing.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_1500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~1500 character long text + (~1028 tokens, exceeding 512 token limit).""" + + # Verify text length + # Verify text has sufficient word count (approximately 1500 words) + word_count = len(LONG_TEXT_1500_WORDS.split()) + assert word_count >= 1400, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_1500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~1500 words, we expect roughly + # 1024+ tokens (exceeding 512 token limit) + # Should exceed single chunk limit of 512 + assert 
embeddings.usage.prompt_tokens > 800 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_2500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~2500 character long text + (~2048 tokens, requiring multiple chunks).""" + + # Verify text length + # Verify text has sufficient word count (approximately 2500 words) + word_count = len(LONG_TEXT_2500_WORDS.split()) + assert word_count >= 2300, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_2500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~2500 words, we expect + # roughly 2048+ tokens (requiring multiple chunks) + # Should require multiple chunks for processing + assert embeddings.usage.prompt_tokens > 1500 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_long_text_embedding( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test batch long text embedding processing.""" + + input_texts = [ + LONG_TEXT_1500_WORDS, + LONG_TEXT_2500_WORDS, + "This is a short text test.", # Short text for comparison + ] + + # Send batch embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 # Three input texts + + # Verify each embedding dimension + for i, embedding_data in enumerate(embeddings.data): + assert len(embedding_data.embedding) == 384 + assert embedding_data.index == i + + # Verify embedding vector validity + embedding_vector = embedding_data.embedding + assert all(isinstance(x, float) for x in embedding_vector) + assert not all(x == 0 for x in embedding_vector) + + # Verify token usage + assert embeddings.usage.completion_tokens == 0 + # Total token count should be very substantial + assert embeddings.usage.prompt_tokens > 1000 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chunked_vs_normal_consistency( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test consistency between chunked and + normal processing (using short text).""" + + # Use a short text within the 512 token limit + short_text = ("Artificial intelligence technology is changing our world, " + "bringing unprecedented opportunities and challenges.") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[short_text], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 384 + assert embeddings.usage.completion_tokens == 0 + # Short text should not require chunked processing + assert embeddings.usage.prompt_tokens < 512 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # 验证embedding向量的有效性 + embedding_vector = embeddings.data[0].embedding + assert all(isinstance(x, float) for x in embedding_vector) + assert not all(x == 0 for x in embedding_vector) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chunked_processing_response_format( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test response format and structure during chunked processing.""" + + # Test with long text to trigger chunking + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_1500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert embeddings.data[0].object == "embedding" + assert embeddings.data[0].index == 0 + + # Verify embedding vector properties + embedding_vector = embeddings.data[0].embedding + import math + vector_norm = math.sqrt(sum(x * x for x in embedding_vector)) + # Check that the vector is normalized + # (default behavior for most embedding models) + assert 0.8 < vector_norm < 1.2, ( + f"Vector norm should be reasonable, actual: {vector_norm}") diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6649cd89ee..b4ea15ef5a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2598,6 +2598,25 @@ class PoolerConfig: ``math-shepherd-mistral-7b-prm`` model. """ + enable_chunked_processing: Optional[bool] = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + + max_embed_len: Optional[int] = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + This parameter enables accepting long inputs without requiring + VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds + max_embed_len, it will be handled according to the original max_model_len + validation logic. Defaults to None (i.e. set to max_model_len). 
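+    As an illustration (a sketch based on the chunked-processing test in this
+    PR): with max_model_len=512, max_embed_len=10000 and
+    enable_chunked_processing=True, a 2000-token prompt is accepted and split
+    into ceil(2000 / 512) = 4 chunks (512 + 512 + 512 + 464 tokens); the
+    per-chunk embeddings are then combined with a token-count-weighted mean,
+    i.e. (512*e1 + 512*e2 + 512*e3 + 464*e4) / 2000.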
+ """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 84ba008731..9dcad8e391 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -from typing import Final, Literal, Optional, Union, cast +from collections.abc import AsyncGenerator, Mapping +from typing import Any, Final, Literal, Optional, Union, cast import numpy as np +import torch from fastapi import Request from typing_extensions import assert_never, override @@ -12,19 +14,28 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger +# yapf conflicts with isort for this docstring +# yapf: disable from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, OpenAIServing, - ServeContext) + RequestPrompt, + ServeContext, + TextTokensPrompt) +# yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, - PoolingRequestOutput) + PoolingOutput, PoolingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams +from vllm.utils import chunk_list logger = init_logger(__name__) @@ -46,6 +57,17 @@ def _get_embedding( class EmbeddingMixin(OpenAIServing): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + pooler_config = self.model_config.pooler_config + + # Avoid repeated attribute lookups + self.supports_chunked_processing = bool( + pooler_config and pooler_config.enable_chunked_processing) + self.max_embed_len = (pooler_config.max_embed_len if pooler_config + and pooler_config.max_embed_len else None) + @override async def _preprocess( self, @@ -129,6 +151,435 @@ class EmbeddingMixin(OpenAIServing): usage=usage, ) + def _get_max_position_embeddings(self) -> int: + """Get the model's effective maximum sequence length for chunking.""" + return self.model_config.max_model_len + + def _should_use_chunked_processing(self, request) -> bool: + """Check if chunked processing should be used for this request.""" + return isinstance( + request, + (EmbeddingCompletionRequest, + EmbeddingChatRequest)) and self.supports_chunked_processing + + async def _process_chunked_request( + self, + ctx: EmbeddingServeContext, + original_prompt: TextTokensPrompt, + pooling_params, + trace_headers, + prompt_idx: int, + ) -> list[AsyncGenerator[PoolingRequestOutput, None]]: + """Process a single prompt using chunked processing.""" + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + token_ids = original_prompt["prompt_token_ids"] + + # Split into chunks using max_position_embeddings + max_pos_embeddings = self._get_max_position_embeddings() + # Process all chunks for MEAN aggregation + for chunk_idx, chunk_tokens in enumerate( + chunk_list(token_ids, max_pos_embeddings)): + # Create a request ID for this chunk + 
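+            # For illustration: with the "embd" request-id prefix used by
+            # OpenAIServingEmbedding, a chunk id looks roughly like
+            # "embd-<base id>-prompt-0-chunk-2"; including the prompt index in
+            # the id is what keeps chunk ids unique when a batch contains
+            # several long prompts.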
chunk_request_id = (f"{ctx.request_id}-prompt-{prompt_idx}-" + f"chunk-{chunk_idx}") + + # Create engine prompt for this chunk + chunk_engine_prompt = EngineTokensPrompt( + prompt_token_ids=chunk_tokens) + + # Create chunk request prompt for logging + chunk_text = "" + chunk_request_prompt = TextTokensPrompt( + prompt=chunk_text, prompt_token_ids=chunk_tokens) + + # Log the chunk + self._log_inputs(chunk_request_id, + chunk_request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Create generator for this chunk and wrap it to return indices + original_generator = self.engine_client.encode( + chunk_engine_prompt, + pooling_params, + chunk_request_id, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + generators.append(original_generator) + + return generators + + def _validate_input( + self, + request, + input_ids: list[int], + input_text: str, + ) -> TextTokensPrompt: + """Override to support chunked processing for embedding requests.""" + token_num = len(input_ids) + + # Note: EmbeddingRequest doesn't have max_tokens + if isinstance(request, + (EmbeddingCompletionRequest, EmbeddingChatRequest)): + # Check if chunked processing is enabled for pooling models + enable_chunked = self._should_use_chunked_processing(request) + + # Use max_position_embeddings for chunked processing decisions + max_pos_embeddings = self._get_max_position_embeddings() + + # Determine the effective max length for validation + if self.max_embed_len is not None: + # Use max_embed_len for validation instead of max_model_len + length_type = "maximum embedding input length" + max_length_value = self.max_embed_len + else: + # Fall back to max_model_len validation (original behavior) + length_type = "maximum context length" + max_length_value = self.max_model_len + + validation_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. Please reduce the length of the input.") + + chunked_processing_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. 
Please reduce the length of the input " + "or enable chunked processing.") + + # Check if input exceeds max length + if token_num > max_length_value: + raise ValueError( + validation_error_msg.format( + length_type=length_type, + max_length_value=max_length_value, + token_num=token_num)) + + # Check for chunked processing + # when exceeding max_position_embeddings + if token_num > max_pos_embeddings: + if enable_chunked: + # Allow long inputs when chunked processing is enabled + logger.info( + "Input length %s exceeds max_position_embeddings " + "%s, will use chunked processing", token_num, + max_pos_embeddings) + else: + raise ValueError( + chunked_processing_error_msg.format( + length_type="maximum position embeddings length", + max_length_value=max_pos_embeddings, + token_num=token_num)) + + return TextTokensPrompt(prompt=input_text, + prompt_token_ids=input_ids) + + # For other request types, use the parent's implementation + return super()._validate_input(request, input_ids, input_text) + + def _is_text_tokens_prompt(self, prompt) -> bool: + """Check if a prompt is a TextTokensPrompt (has prompt_token_ids).""" + return (isinstance(prompt, dict) and "prompt_token_ids" in prompt + and "prompt_embeds" not in prompt) + + async def _create_single_prompt_generator( + self, + ctx: EmbeddingServeContext, + engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt], + request_prompt: RequestPrompt, + pooling_params: PoolingParams, + trace_headers: Optional[Mapping[str, str]], + prompt_index: int, + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + """Create a generator for a single prompt using standard processing.""" + request_id_item = f"{ctx.request_id}-{prompt_index}" + + self._log_inputs(request_id_item, + request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Mypy has an existing bug related to inferring the variance + # of TypedDicts with `builtins.enumerate`: + # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 + engine_prompt = cast(Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + + # Return the original generator without wrapping + return self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + @override + async def _prepare_generators( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Override to support chunked processing.""" + ctx = cast(EmbeddingServeContext, ctx) + + # Check if we should use chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + # If no chunked processing needed, delegate to parent class + if not use_chunked: + return await super()._prepare_generators(ctx) + + # Custom logic for chunked processing + generators: list[AsyncGenerator[Union[RequestOutput, + PoolingRequestOutput], + None]] = [] + + try: + trace_headers = (None if ctx.raw_request is None else await + self._get_trace_headers(ctx.raw_request.headers)) + + pooling_params = self._create_pooling_params(ctx) + if isinstance(pooling_params, ErrorResponse): + return pooling_params + + # Verify and set the task for pooling params + try: + pooling_params.verify("embed", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + if ctx.request_prompts is None: + return self.create_error_response( + 
"Request prompts not available") + + max_pos_embeddings = self._get_max_position_embeddings() + + for i, engine_prompt in enumerate(ctx.engine_prompts): + request_prompt = ctx.request_prompts[i] + + # Check if this specific prompt needs chunked processing + if self._is_text_tokens_prompt(request_prompt): + # Cast to TextTokensPrompt since we've verified + # prompt_token_ids + text_tokens_prompt = cast(TextTokensPrompt, request_prompt) + if (len(text_tokens_prompt["prompt_token_ids"]) + > max_pos_embeddings): + # Use chunked processing for this prompt + chunk_generators = await self._process_chunked_request( + ctx, text_tokens_prompt, pooling_params, + trace_headers, i) + generators.extend(chunk_generators) + continue + + # Normal processing for short prompts or non-token prompts + # Cast engine_prompt to the expected type for mypy + engine_prompt_typed = cast( + Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + generator = await self._create_single_prompt_generator( + ctx, engine_prompt_typed, request_prompt, pooling_params, + trace_headers, i) + generators.append(generator) + + from vllm.utils import merge_async_iterators + ctx.result_generator = merge_async_iterators(*generators) + + return None + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + @override + async def _collect_batch( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Collect and aggregate batch results + with support for chunked processing. + + For chunked requests, performs online aggregation to + minimize memory usage. + For regular requests, collects results normally. + """ + ctx = cast(EmbeddingServeContext, ctx) + try: + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + # Check if we used chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + if not use_chunked: + return await super()._collect_batch(ctx=ctx) + + if ctx.request_prompts is None: + return self.create_error_response( + "Request prompts not available") + + if ctx.result_generator is None: + return self.create_error_response( + "Result generator not available") + + # Online aggregation for chunked requests to + # minimize memory usage + # Track aggregation state for each prompt + prompt_aggregators: dict[int, dict[str, Any]] = {} + short_prompts_results: dict[int, PoolingRequestOutput] = {} + + async for result_idx, result in ctx.result_generator: + if "-chunk-" in result.request_id: + # Extract prompt_idx from chunked request_id + parts = result.request_id.split("-") + try: + prompt_idx = int(parts[parts.index("prompt") + 1]) + except (ValueError, IndexError): + # Fallback: extract from result_idx if parsing fails + prompt_idx = result_idx + + # Initialize aggregator for this prompt if needed + if prompt_idx not in prompt_aggregators: + prompt_aggregators[prompt_idx] = { + 'weighted_sum': None, + 'total_weight': 0, + 'chunk_count': 0, + 'request_id': result.request_id.split("-chunk-")[0] + } + + aggregator = prompt_aggregators[prompt_idx] + + # MEAN pooling with online weighted averaging + # Ensure result is PoolingRequestOutput + # for embedding processing + if not isinstance(result, PoolingRequestOutput): + return self.create_error_response( + f"Expected PoolingRequestOutput for " + f"chunked embedding, got " + f"{type(result).__name__}") + + # Handle both PoolingOutput and + # EmbeddingOutput types + if hasattr(result.outputs, 'data'): + # PoolingOutput case + 
embedding_data = result.outputs.data + elif hasattr(result.outputs, 'embedding'): + # EmbeddingOutput case - + # convert embedding list to tensor + embedding_data = result.outputs.embedding + else: + return self.create_error_response( + f"Unsupported output type: " + f"{type(result.outputs).__name__}") + + if not isinstance(embedding_data, torch.Tensor): + embedding_data = torch.tensor(embedding_data, + dtype=torch.float32) + + if result.prompt_token_ids is None: + return self.create_error_response( + "prompt_token_ids cannot be None for " + "chunked processing") + weight = len(result.prompt_token_ids) + + weighted_embedding = embedding_data.to( + dtype=torch.float32) * weight + + if aggregator['weighted_sum'] is None: + # First chunk + aggregator['weighted_sum'] = weighted_embedding + else: + # Accumulate + aggregator['weighted_sum'] += weighted_embedding + + aggregator['total_weight'] += weight + aggregator['chunk_count'] += 1 + else: + # Non-chunked result - extract prompt_idx from request_id + parts = result.request_id.split("-") + try: + # Last part should be prompt index + prompt_idx = int(parts[-1]) + except (ValueError, IndexError): + prompt_idx = result_idx # Fallback to result_idx + + short_prompts_results[prompt_idx] = cast( + PoolingRequestOutput, result) + + # Finalize aggregated results + final_res_batch: list[Union[PoolingRequestOutput, + EmbeddingRequestOutput]] = [] + num_prompts = len(ctx.engine_prompts) + + for prompt_idx in range(num_prompts): + if prompt_idx in prompt_aggregators: + # Finalize MEAN aggregation for this chunked prompt + aggregator = prompt_aggregators[prompt_idx] + + weighted_sum = aggregator['weighted_sum'] + total_weight = aggregator['total_weight'] + + if (weighted_sum is not None + and isinstance(weighted_sum, torch.Tensor) + and isinstance(total_weight, + (int, float)) and total_weight > 0): + + # Compute final mean embedding + final_embedding = weighted_sum / total_weight + + # Create a PoolingRequestOutput + # for the aggregated result + pooling_output_data = PoolingOutput( + data=final_embedding) + + # Get original prompt token IDs for this prompt + original_prompt = ctx.request_prompts[prompt_idx] + if not self._is_text_tokens_prompt(original_prompt): + return self.create_error_response( + f"Chunked prompt {prompt_idx} is not a " + f"TextTokensPrompt") + + original_token_ids = cast( + TextTokensPrompt, + original_prompt)["prompt_token_ids"] + + pooling_request_output = PoolingRequestOutput( + request_id=aggregator['request_id'], + prompt_token_ids=original_token_ids, + outputs=pooling_output_data, + finished=True) + + final_res_batch.append(pooling_request_output) + else: + return self.create_error_response( + f"Failed to aggregate chunks " + f"for prompt {prompt_idx}") + elif prompt_idx in short_prompts_results: + final_res_batch.append( + cast(PoolingRequestOutput, + short_prompts_results[prompt_idx])) + else: + return self.create_error_response( + f"Result not found for prompt {prompt_idx}") + + ctx.final_res_batch = cast( + list[Union[RequestOutput, PoolingRequestOutput]], + final_res_batch) + + return None + + except Exception as e: + return self.create_error_response(str(e)) + class OpenAIServingEmbedding(EmbeddingMixin): request_id_prefix = "embd" From 98deac3879860b829dd9a30b19bbb2adb9c96e7f Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 13 Aug 2025 20:27:25 +0800 Subject: [PATCH 233/932] [FEATURE] support custom vllm tuned config path for fused moe triton kernels (#22791) Signed-off-by: Chi Zhang --- vllm/envs.py | 6 ++++ 
.../layers/fused_moe/fused_moe.py | 28 +++++++++++++------ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 931edcfa7f..e7796aa73d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -158,6 +158,7 @@ if TYPE_CHECKING: VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None def get_default_cache_root(): @@ -1120,6 +1121,11 @@ environment_variables: dict[str, Callable[[], Any]] = { # never removed from memory until the server terminates. "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), + + # Allows vllm to find tuned config under customized folder + "VLLM_TUNED_CONFIG_FOLDER": + lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ad094c37f9..98087a35e1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -701,20 +701,32 @@ def get_moe_configs( block_shape = [block_n, block_k] if block_n and block_k else None json_file_name = get_config_file_name(E, N, dtype, block_shape) - config_file_path = os.path.join( + config_file_paths = [] + + # note that we prioritize user defined config + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + user_defined_config_file_path = os.path.join( + user_defined_config_folder, json_file_name) + config_file_paths.append(user_defined_config_file_path) + + default_config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) - if os.path.exists(config_file_path): - with open(config_file_path) as f: - logger.info("Using configuration from %s for MoE layer.", - config_file_path) - # If a configuration has been found, return it - return {int(key): val for key, val in json.load(f).items()} + config_file_paths.append(default_config_file_path) + + for config_file_path in config_file_paths: + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info("Using configuration from %s for MoE layer.", + config_file_path) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} # If no optimized configuration is available, we will use the default # configuration logger.warning( ("Using default MoE config. Performance might be sub-optimal! 
" - "Config file not found at %s"), config_file_path) + "Config file not found at %s"), config_file_paths) return None From 6b794c756c5a6b3c443c19a093435d02d91d525f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 13 Aug 2025 15:03:53 +0200 Subject: [PATCH 234/932] [Nixl][CI] Fix tests (#22806) Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_nixl_connector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 3860d7c857..b185936ab0 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -229,6 +229,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): num_blocks=1, block_len=self.block_len, attn_backend_name=self.backend_name, + # `self.kv_cache_layout` is only forced to HND when vllm engine + # is started. We mock HND here. + kv_cache_layout="HND", ), remote_tp_size=remote_tp_size) return {0: remote_agent_name} From fceafaf582cd72e6636f47127a665afb9e0ea0aa Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 13 Aug 2025 06:07:09 -0700 Subject: [PATCH 235/932] [Bugfix][mamba] Fix type annotation of Mamba2Metadata (#22787) Signed-off-by: Chen Zhang --- .../layers/mamba/mamba_mixer2.py | 8 ++-- vllm/v1/attention/backends/mamba_attn.py | 39 +++++++++++-------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index d5f4877135..10a5618c22 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -473,12 +473,12 @@ class MambaMixer2(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] state_indices_tensor = attn_metadata.state_indices_tensor - has_initial_states_p = attn_metadata.has_initial_states + has_initial_states_p = attn_metadata.has_initial_states_p prep_initial_states = attn_metadata.prep_initial_states chunk_size = attn_metadata.chunk_size - seq_idx_p = attn_metadata.seq_idx - chunk_indices_p = attn_metadata.chunk_indices - chunk_offsets_p = attn_metadata.chunk_offsets + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p else: conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 7c1226049f..3f84f8967d 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -68,14 +68,19 @@ class Mamba2AttentionMetadata: query_start_loc: torch.Tensor seq_lens: torch.Tensor - has_initial_states: torch.Tensor prep_initial_states: bool chunk_size: int - seq_idx: torch.Tensor - chunk_indices: torch.Tensor - chunk_offsets: torch.Tensor + + # The following tensors only contain prefill requests and will be None if + # the batch has no prefill request. 
+ has_initial_states_p: Optional[torch.Tensor] + seq_idx_p: Optional[torch.Tensor] + chunk_indices_p: Optional[torch.Tensor] + chunk_offsets_p: Optional[torch.Tensor] state_indices_tensor: torch.Tensor # shape: [batch,] + + # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.tensor] = None @@ -115,11 +120,11 @@ class Mamba2AttentionMetadataBuilder( query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens - seq_idx = None - chunk_indices, chunk_offsets = None, None + seq_idx_p = None + chunk_indices_p, chunk_offsets_p = None, None # Need flags to indicate if there are initial states # currently we really only support the FlashAttention backend - has_initial_states = None + has_initial_states_p = None prep_initial_states = False state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] @@ -135,25 +140,25 @@ class Mamba2AttentionMetadataBuilder( common_attn_metadata. num_computed_tokens_cpu[num_reqs - num_prefills:num_reqs] > 0) prep_initial_states = torch.any(has_initial_states_cpu).item() - has_initial_states = has_initial_states_cpu.to( + has_initial_states_p = has_initial_states_cpu.to( query_start_loc.device) query_start_loc_p = common_attn_metadata.query_start_loc[ -num_prefills - 1:] - num_decode_tokens - seq_idx = torch.repeat_interleave(torch.arange( + seq_idx_p = torch.repeat_interleave(torch.arange( num_prefills, dtype=torch.int32, device=query_start_loc_p.device), - query_start_loc_p.diff(), - output_size=num_prefill_tokens) - seq_idx.unsqueeze_(0) + query_start_loc_p.diff(), + output_size=num_prefill_tokens) + seq_idx_p.unsqueeze_(0) # We compute metadata for chunked prefill once at the top level # model forward and reuse them in mamba layers. If not needed, # they will be ignored inside mamba kernels. if prep_initial_states: - chunk_indices, chunk_offsets = ( + chunk_indices_p, chunk_offsets_p = ( _query_start_loc_to_chunk_indices_offsets( query_start_loc_p, self.chunk_size, num_prefill_tokens)) @@ -173,12 +178,12 @@ class Mamba2AttentionMetadataBuilder( num_decode_tokens=num_decode_tokens, query_start_loc=query_start_loc, seq_lens=seq_lens, - has_initial_states=has_initial_states, prep_initial_states=prep_initial_states, chunk_size=self.chunk_size, - seq_idx=seq_idx, - chunk_indices=chunk_indices, - chunk_offsets=chunk_offsets, + has_initial_states_p=has_initial_states_p, + seq_idx_p=seq_idx_p, + chunk_indices_p=chunk_indices_p, + chunk_offsets_p=chunk_offsets_p, state_indices_tensor=state_indices_tensor, ) return attn_metadata From 6772bb0f7d58d137576f386dd921117a5a00f0fb Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Wed, 13 Aug 2025 21:07:28 +0800 Subject: [PATCH 236/932] Remove unnecessary CUDA sync of qwen image and video preprocess (#22792) Signed-off-by: cyy Signed-off-by: Yuanyuan Chen Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6bea180ffe..5bcbcc4f0e 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -976,10 +976,12 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. 
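+        # (A note on the change: grid_thw lives on the GPU in this path, so
+        # grid_thw.prod(-1) followed by .tolist() would block on a
+        # device-to-host copy; deriving the sizes from the CPU-side
+        # grid_thw_list sidesteps that, as the comment below notes.)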
+ # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return image_embeds.split(sizes.tolist()) + return image_embeds.split(sizes) def _process_video_input( self, @@ -998,9 +1000,11 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return video_embeds.split(sizes.tolist()) + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} From b159c0a67aaafe865c785d289335c3760e01a62f Mon Sep 17 00:00:00 2001 From: Gh0u1L5 Date: Wed, 13 Aug 2025 21:08:23 +0800 Subject: [PATCH 237/932] Fix GGUF loader for Qwen3 MoE. (#22785) Signed-off-by: Gh0u1L5 --- vllm/model_executor/model_loader/gguf_loader.py | 11 +++++++++++ vllm/model_executor/models/qwen3_moe.py | 1 + 2 files changed, 12 insertions(+) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 26af87c1ed..21655b0c69 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -74,6 +74,17 @@ class GGUFModelLoader(BaseModelLoader): f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ f"model.layers.{idx}.mlp.experts.0.up_proj.weight" + if model_type in ("qwen2_moe", "qwen3_moe"): + model_type = model_type.replace("_", "") + # GGUF layer map assumes that we will have a merged expert weights + # so we need to map them manually + for idx in range(config.num_hidden_layers): + gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.down_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.up_proj.weight" arch = None for key, value in gguf.MODEL_ARCH_NAMES.items(): diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 085fc90b47..61b16b6a1d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -375,6 +375,7 @@ class Qwen3MoeModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, From 20d65aa75548c4ee0c9b69847f177dec085cd358 Mon Sep 17 00:00:00 2001 From: milesial Date: Wed, 13 Aug 2025 06:09:26 -0700 Subject: [PATCH 238/932] [Frontend] Multithreaded async multimodal load_bytes (#22710) Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> --- vllm/envs.py | 7 +++++++ vllm/multimodal/utils.py | 26 ++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 
e7796aa73d..145ec3495a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,6 +63,7 @@ if TYPE_CHECKING: VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 4 @@ -555,6 +556,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Max number of workers for the thread pool handling + # media bytes loading. Set to 1 to disable parallel processing. + # Default is 8 + "VLLM_MEDIA_LOADING_THREAD_COUNT": + lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")), + # Maximum filesize in MB for a single audio file when processing # speech-to-text requests. Files larger than this will be rejected. # Default is 25 MB diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 8dfbc65035..b8266fd350 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import atexit +from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union @@ -33,6 +36,10 @@ else: MultiModalKwargs = Any MultiModalPlaceholderDict = Any +global_thread_pool = ThreadPoolExecutor( + max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT) +atexit.register(global_thread_pool.shutdown) + class MediaConnector: @@ -139,19 +146,26 @@ class MediaConnector: fetch_timeout: Optional[int] = None, ) -> _M: url_spec = urlparse(url) + loop = asyncio.get_running_loop() if url_spec.scheme.startswith("http"): connection = self.connection data = await connection.async_get_bytes(url, timeout=fetch_timeout) - - return media_io.load_bytes(data) + future = loop.run_in_executor(global_thread_pool, + media_io.load_bytes, data) + return await future if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + future = loop.run_in_executor(global_thread_pool, + self._load_data_url, url_spec, + media_io) + return await future if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) - + future = loop.run_in_executor(global_thread_pool, + self._load_file_url, url_spec, + media_io) + return await future msg = "The URL must be either a HTTP, data or file URL." 
raise ValueError(msg) @@ -489,4 +503,4 @@ def fetch_video( "video": video_io_kwargs } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) - return media_connector.fetch_video(video_url) \ No newline at end of file + return media_connector.fetch_video(video_url) From 19b927e52df8400084df1c8116af7d6f0a5f5d15 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 13 Aug 2025 22:18:07 +0800 Subject: [PATCH 239/932] [Core] Use individual MM items in P0/P1 cache and model runner (#22570) Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 235 +++++++------------ tests/v1/core/test_kv_cache_utils.py | 48 ++-- tests/v1/core/test_prefix_caching.py | 31 ++- tests/v1/core/test_scheduler.py | 21 +- tests/v1/core/utils.py | 19 +- tests/v1/engine/test_engine_core.py | 2 +- tests/v1/engine/test_engine_core_client.py | 2 +- tests/v1/engine/test_output_processor.py | 10 +- tests/v1/kv_connector/unit/utils.py | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 +- tests/v1/worker/test_gpu_input_batch.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- vllm/multimodal/inputs.py | 141 +++++++++-- vllm/multimodal/utils.py | 135 ++++++----- vllm/v1/core/sched/output.py | 10 +- vllm/v1/engine/__init__.py | 6 +- vllm/v1/engine/core.py | 7 +- vllm/v1/engine/mm_input_cache.py | 78 +++--- vllm/v1/engine/processor.py | 64 ++--- vllm/v1/request.py | 21 +- vllm/v1/serial_utils.py | 48 ++-- vllm/v1/worker/gpu_input_batch.py | 13 +- vllm/v1/worker/gpu_model_runner.py | 97 ++++---- vllm/v1/worker/tpu_model_runner.py | 39 ++- 24 files changed, 549 insertions(+), 486 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 3fdf7e33ca..41f4773a11 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -5,7 +5,7 @@ import base64 import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, NamedTuple, Optional +from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest @@ -19,14 +19,12 @@ from vllm.distributed.parallel_state import (init_distributed_environment, initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import (MediaConnector, - merge_and_sort_multimodal_metadata, +from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, run_dp_sharded_vision_model) from vllm.platforms import current_platform from vllm.utils import get_open_port, update_environment_variables if TYPE_CHECKING: - from vllm.multimodal.hasher import MultiModalHashDict from vllm.multimodal.inputs import MultiModalPlaceholderDict # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -178,19 +176,17 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async -# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +# Used for `test_argsort_mm_positions`. 
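+# Each TestCase pairs a placeholder layout (mm_positions) with the order in
+# which argsort_mm_positions is expected to visit the (modality, item index)
+# pairs, i.e. sorted by each placeholder's offset in the prompt.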
class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" - mm_hashes: Optional["MultiModalHashDict"] - expected_modalities: list[str] - expected_ranges: list[PlaceholderRange] - expected_hashes: Optional[list[str]] + expected_modality_idxs: list[tuple[str, int]] -def test_merge_and_sort_multimodal_metadata(): +def test_argsort_mm_positions(): test_cases = [ - # Single modality should return result as is but flattened + # Single modality + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -198,34 +194,27 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=3, length=2), ] }, - mm_hashes={"image": ["hash1", "hash2"]}, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=2), + expected_modality_idxs=[ + ("image", 0), + ("image", 1), ], - expected_hashes=["hash1", "hash2"], ), - - # Single modality without hashes return None for mm hash. + ## Internally unsorted TestCase( mm_positions={ "image": [ + PlaceholderRange(offset=3, length=2), PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), ] }, - mm_hashes=None, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), + expected_modality_idxs=[ + ("image", 1), + ("image", 0), ], - expected_hashes=None, ), - # Multiple modalities with hashes should return sorted modalities - # and flattened ranges and hashes. + # Two modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -237,47 +226,54 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=2, length=3), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1", "audio_hash2"], - }, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), - ], - expected_hashes=[ - "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + expected_modality_idxs=[ + ("audio", 0), + ("audio", 1), + ("image", 0), + ("image", 1), ], ), - - # Multiple modalities without hashes should return sorted modalities - # and flattened ranges and None. 
+ ## Interleaved, internally sorted TestCase( mm_positions={ "image": [ - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + PlaceholderRange(offset=0, length=4), + PlaceholderRange(offset=8, length=2), ], "audio": [ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=5, length=2), + PlaceholderRange(offset=11, length=4), ] }, - mm_hashes=None, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + expected_modality_idxs=[ + ("image", 0), + ("audio", 0), + ("image", 1), + ("audio", 1), + ], + ), + ## Interleaved, internally unsorted + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=8, length=2), + PlaceholderRange(offset=0, length=4), + ], + "audio": [ + PlaceholderRange(offset=11, length=4), + PlaceholderRange(offset=5, length=2), + ] + }, + expected_modality_idxs=[ + ("image", 1), + ("audio", 1), + ("image", 0), + ("audio", 0), ], - expected_hashes=None, ), # Three modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -293,72 +289,16 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=12, length=6), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1"], - "video": ["video_hash1", "video_hash2", "video_hash3"] - }, - expected_modalities=[ - "audio", "video", "video", "video", "image", "image" - ], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=4), - PlaceholderRange(offset=7, length=5), - PlaceholderRange(offset=12, length=6), - PlaceholderRange(offset=15, length=7), - PlaceholderRange(offset=22, length=8), - ], - expected_hashes=[ - "audio_hash1", "video_hash1", "video_hash2", "video_hash3", - "image_hash1", "image_hash2" + expected_modality_idxs=[ + ("audio", 0), + ("video", 0), + ("video", 1), + ("video", 2), + ("image", 0), + ("image", 1), ], ), - ] - - for (mm_positions, mm_hashes, expected_modalities, expected_ranges, - expected_hashes) in test_cases: - modalities, ranges, hashes = merge_and_sort_multimodal_metadata( - mm_positions, mm_hashes) - - assert modalities == expected_modalities - assert ranges == expected_ranges - assert hashes == expected_hashes - - -def test_merge_and_sort_multimodal_metadata_with_interleaving(): - - test_cases = [ - - # R;YGh)Gi0%dA~;Y3!s6?_IG=x1zb%MzeR_D~z*~~la(lAO zdMXBxOk$a>dG=xkl*H31jYX&n6HR)5DWoFkyuZZGbC|WhaE~!mEHXV04zfz3QPCVr z=OXKPdTw!Q8fZt#yIFrj_a39|+m>+hfsIN+-Z-C0(1m8k+E3xGuV zX0N&W404&9)~kaRnGIgF#pGw}r%Of$d;*iDS_bV_Vk>@AitmPfEIZ9NF%gQQ8-*pw zrK)vd@Q@TzX()^E@ev9n@^IG5L=oXLMvWSAE?X(R61D3$tlsdpe676gw?Y<8&QGb1 zrn9OWec@D3Fi0%J6)i5pRB&AOSIOyoF2Bf8+KDELV32W1ZB&X$MG>%LBx4DD%j%ZiGr=IqwiNMJ=T$`>1?+bb;as<&$SF&|Lg(Xo}P#f5E;jHGipuGydvKKP_qCXPmE@?5`4$6__R@zYaXa**%#VjFr472DE0inLg%0+ydG8({b)J+mxjg#62<_(B)fgWP zT!mfj&nW52qNL@msWK;eN|SR?z7}Q{2`_(yaRZMk?yvPTwV+Rch^IiwO#s z&vUwhb9WoK(b-RfYEAlR;LQom6YwTR&j%>ee0>CnJkRog@KcRKd`m~o*W$z$CMXr- z^^*BHt9Qx^`-G8F=K14BZ{+6(c31Z7=dKq8i@N^tM2hf;X^#e{tZlD3FM z4B>avdJ!lw@~d2BqD~GdNRV&P15@pIk1*9Bgs<9B?lHb{p`0T!S>Qy=b-(Jf)Arok z@sc{ycnIs}9^{sbaYn%i=W%~QF*V!j@si2K20AYC=o*i(G_bHln1 zY}x=HE|m&>D4HC$lPa(qRq@XI8x}eoYKgdfzlTngYMon%`&6 zC$aRsNR#zqqpN$1iq!8(=`KX15XffmI0S8tqzxRkVAzWlct5A`-fzYATjHq3t$N=E zu+K$M{;1*>PR;VV(ZC?YN=UT-Nut>Drqy+8JZb!BJ|82Dn7@_SMkW78fu46_emK@U 
zaGao0$jxItmhj83*6=j;M{EFzfm6twWGX~u6iZ&TOU!E|; zlELJ|J_9kFTE1a6j)e?=-pmsI3niD4rSd+`^CsFGK!Q4y0Cs>OSG#P^-au5TOvU^u9g*X>R+2FYlcs(6~=vuFyXi0FcpUGlIIoLueVRkvR02;!YGo&Z6!3*LJ znS8r}QpDm}-+di5*`=sgFDH0}Hn}A)H1gK`}eNv zPrFFq!{X_3D^d0*e<;)}x{#Rl*b7W1kDRWHAjNAJ$(qTgau*@3fC{t|(MfsV zVh-dhB@MA01DdDonq_Z|Ad>3^r`U~b--wnfs;6}gB03;9-qP}fDkhTXvu46BKPMJ( z_?*@b|Gbb$--vul5x8flAw~gNr!YDLrT={dIgW_Ov#)6R@O3YMa_zFAhx27G?d1r!s>afD%%CF8rOq)nIh8u5Z2 zO#E23V!+cn3kNUQhKtEdOpfOT5XlC&U!B{DjzJAfd%@d)yaBx#y2#vO7pT6_7|0HJ^PM^e>dFP1ytnj>f%#vDC> zk9f~^i)-cbBn^_}mOVLuk~_GvNS(P|z^TnZLJKxJ^PnvTyrDzV~lxDImtgZ|Vuu z%)c6zM8s^?NLKXb=(lg5Ajj7zV;LV7!ZdL2#*0guw$x&$ z%C%tlOhzmQk9qo+yc9Rguh^e|mRu-Mb-DKJZ;aZ;lq>$@2g>@@Q_g-gJSa#I>zd zJM%)vQD9nJ1R%-pq2b*>WH)6KV&F-^^Iv79kgYI$piJ3h1dPgl-xSkCfvVITYG6aZ z*3F;YyhpS>1QmYs=ke&AgJB606dI)J>NT z5XUf+OUmJ^Fdu^zl(y`6@v6Br0F7R?Zm41OicOm7eQl{1=w>OU+;5J)BFF=Abx0xI zjBdipOJ~QfRbm$T9E*N|cNb}RJL{d>HkqsDJB-ai>TQ2reBN7z%Hv|eZTU{`=J=ts zTbVQXV}$IF>*j;I>DK{PBgPre#SAY$Owus0G%TA;{(N7I2ucV>s-$EL$inanfaZyA zFj6n@BlM}XW50qqsOcm?Zq9ir2X)-&TZitf$9nrdUM9u9JZq{yYE<$8Qbf_C9Hp26 z+xBeKk3D(J2Ue|pJ0q*@*j=bGL4#h-)w&gaZL;kJD%f2yke7u(2%-r{z37~_Dl+TV z(qEUwA}}DM4A4B;KAS2mhWq#mtaVPUoAymwt$KJF9KzzW+of+l2%hF8<~+p&V)TCX za`m-)vR^3siwm23Y>WwtlpytUaUj2!Houa0H0ZBB^s>o3G}H^_XS*%it4fU0IHXZp zF#!)PxHknzM)tl$v8M6a{5eTG$76VcX#hl`w5EAi#pb%3_#WB<^h(upHhSaTnrT{s z2T>l{KA&~|2|?Po@C?9UIlYpu1dZEjF^HCW1CnR;ln-21w@)af9tXz`gH~}vY?U$1 zKqbn#0ww0@kYgb#Dczqm48 z3-B4#y1G;BHP(`2r5+g>M{>ko_6K+sfQcdj45FM6m0;9RfnNnMDsjCWEs{GeH(*)|}+aV~hi zbZFuc_Tlcy3VvIPkWFJER&V%xV4-g_7`<-#1QAkE22v>C9Y|F^uG*i5E-XE_J>%!8 zdJ*OqA5W+B0s$Z?Ew>J4$GQ%T0EttO@Vrb*9KnbCfW;ez3lj6801`U+3@V_>z9G2s z5XjK3e<|_3uZ~-7~rQ;RTE( z4-+*kB2|3rm?0cY2ZS#1!k>o%4(Dt$(-{>*)y!WR~ ztvlpG*#^-Fo&$itX(3hnk6DF8#f64iSRQMke`5}TE>Y}s^)&#N%D9aO&QLP03o)?Ba+6wD0_2MGiOgEDCcuL zMrHaee9cns8rNgtI=i-wUdIj)DPJJ=Jc7!Z@*M%@^UBpJCLK*-$!rJxxOs{>m<8IF zi5E8IvYinV0p7(%Y7d*qB@v!c3j0&z zsF?i+EP1mJ^DjQozL5FA|KtOetaBFq#at+Zc$VUdHN_tEBPaVWBWQT6E7 z^XDrVU(Rj2&drC}Wdg~KDB(bdEKwv%BI$f`p4Z2Tm8>R`jSKcm!zqC)D#y(t^=eX< zQ=I3rue2gx^*1JWCZcmL3_Z9gy4jASyKwwgE`Q^|eu#Ig&SXQDK2d`xw=w^8o8V)2@F&)<|B&r8sJkKilQ+L)U z1MlJQEx&}uCpwdU4vwfPLTtoC;3m4z|^T;4rxH{HzzUdv;~v1w(-nIZpmAR)5!>M=WWlHG{{fZdC$I5CR&{s%}C0i z^MRw1f*?6xrOyikNrw5pz+JnwH$EJxuS6k`%O+?sH1*fZU({lBreS6yU+N-|+n?FG z7N?{P@{6?wA#`d2br)*17&pqiL`?R96rGBg=E)6P!|6;-u7;l%d`~cMnRLn_lM5(i zCoFDhLd~Mv@Hcv*;5yrO1a8SedU=_;ax<8hjjtJ@|6tXhR(l5Srye^nssNZ_h{+x zL`ZFFi5f$yj@x>)*7^ZBQ!S^9^`^S?A-XNncymN)b_}mz8~}mH!M4PyNf(nSJvjq6 zD{LlV)QJMY-&Rk3QW1Xw{E*We3AufIXsuBf6^G(C$CFV1f&IcLV>jczk0hyqc(6|6 z38VL=9MRJatF)bpKXPsdececzG$|tN^qPHO&CFLxmxO|nY~d*Jb;y-(9nF4-KHjTx z+F)soba;Q$&$e#@4CXF88^hP)qC1I(F`C(qcrr0)m5!x980fw4zDpi7|GtY)+njfJ zb4wxc+uy4GgzfcGa|{xSoGiNvrNIo z>%a*Ha3r~)c$~DReTmGIzcyWqCg8YZ0xuI4X&MuJsTYFfwp%DSB4gUz)*H5M%s8Bp z+0tj2-OiYHe238Gi8)|gvd3~JGH}Ry@lSTovksZ&d;y<(xJ|PpeZ{`67vkNd#Z-zF z81l+fj2au5(`^QmKxMO5+JHbQ1XbnY<8ICt=gA4fZrO3V}C96f2>7Nmw291s#(cF}X=MAs{ht0-QvPUbJusROSm zh0o&JLuH+p^cpnPM-H6ZBZ$4!M6Iyknn4s!B10?&zslFtE8k+7)0SLGLiSq`KdHKy z1W;>Ya>Mw4lgZ2wg3B}*PRC&)`1mYWIwtEZ@%6Z;+?UeKI4@jA^Z+TIw!CK~tA(0X zA&C8`jz^|z3uj>kH>tK!bMS;sQ9#T~DdP5V=Gs9mZNrm=;~T|Iw_oNPO!b1YStNDM z#DDd5wmgGfXK9)dGdFBEJ#cDb9+(R)lgW=E!)ta19xz{>3)Fke@vDAnv z9L#%v4ABW~7!}r-E4L%dReNr+2#}NonO~%8Z$I4ih&{8Z{Sd94BcobkOIEN($RH0o z<57A+Nz*;#9Wqd-rPclOHaaF>|c@}ZM zQihVTz@BQBJy+MxpddFbz?Cqsk$3XG>jr5#l~b@}5a49ClFh{KSzmc08W{sS!OMbh zh`{hDkM6@{!U1dRLs|)kMZ}thJQ16a{v+K7`PkhVR1q^^l8xIMidAhy8O!91Od{(} zbX``4%o!YI_+l6XjlY5dqse);FOfazyB<_VMDDX8uOT~zfd!5}DT7RDEr*j!0(T8T zn$xz2^8Ls4MR8uZ0AHt)%7=574uiETC6Vg;?-D;?e6B?S4JRFms<_G`QcLqpS%2T7 
zKVlB2Rx>bwEUVRNTMfk|fun_E7v#8kU4c9^TOoqumEE31Qcv^Qwbs0o{auG2CR!6~ zy6Idthv2tq+vDMZ&XwE>uR6!~q)lAbm5$Tt@8mJ|l zk_W$NmvzGUSZlLPA!mAB5hXI}>)N8JA0bbzaR#|W@X>m0&ALykG%Snq<9~l3XatiT zkP$L~puhqS*>1O>`&p^Tlvw~v27JRrinEoO{=OGCc*|(O@0;W9`V7t%|%z6Vs&N=tdm4HRx zvTeNe;hjYcgq6Y}Q*v<8^*)$!c&(iG9nzDwqLw7&c@^=6<_rUm;CrE{(UV;>wYQ-k zy%nv3m|o8-baTL=8J90EJ;6jG(#*P#-Rf$Aw+Mlg0Lbmj0rt-XMIx9rpyIp2z90)| zPqSRbN^nPC`hkxMjLupaJt$eb|AH&I3JnY(w>1Y668L4q-5*dX&GPB)< z36#PwX2)AwzYFa5t3&DwLPkL_y3%;_r)5*ROgf^+6^eR%g*1-$f6+>Tp!lS3(eiMm zqOj6SBYv&5e!|d~z@UgoRxRUg;A@Ry`uH(@Z^XCt-2w0mntpXfx@@veSH0R*regtm ze5<0Q>qyWgET%N$*f(`6EpIFeKfIJOmk9UMmVPgS!3&Jq{!nSY_*12MaPaGQs@?Zt zzvt@JH;P6}VG4VtC~TZ>V>pJb@G};>au6=}wAN|dK0moMVFJ`dQI(=RQ+V*1%eiYK zduLFK1Q1)1`wZaS%$RK_*6Tm*2c5| zGEFGbl}!&!O9@y21Vs6cT~kShBDGuj{MKGFC$q0ztsQED~Pe;&C znO-{Y6fCQ&eHR*t-hbkY5R{j#VI#wD*4@<|z~k78z_Sh_^m_z;f# zyVY<;xy(QjsVg1I#)&@9iDuZJ7R||jvS^;y7hQnim)=&e*xA=SqyhT0jm@-Fad50&#wfWMAUReXF|X)Xx0&lhKf_M;`~3%}sc9Iy z?Z-fbG(h3jz9t;@kkqfZ$Lkw}=b4o)_^|_Nb&~Cx9J=PV*F!Rstd~@h*!;Rjdw7Nh znM6vgr{9OiY(2j_mmpC2@!irV#AO$jrIks>S_;yQE~r>m+P=T;L=ullTfMkGq3g=s z>S>BbC6z?t>erZeRinRYLZ4aPbb`-6-~9TGYNtF_XeQThNl(gjUhGo#{`yId$t4HK zF(C@?I+9qrU1j&0B+xXJNMD2!mKYGZTa;$?(;w0;z#+9fGfDb9x^V1#47)t}}r( z^tJI5hlmpfN6nN|uAnv81Ta1}wS3Vyly}T@h_U{6J?1(q!EL*in{{=8sF{>D;Zn`4 zs_{SRF%|Q#z9~=^QSw?z&x;c_u{B6Y_|L(UmYFz#SWNpkEKkYbp>uaV%Wk5_F}lh2 zP9Ag}aXC~hj$4vT4dih3YNl1XKw6wHmeC{8;0}Qa4B5zZLj_?UNTAt_Ede*J ztxKR^{^+jJA4Y1!kl205fRXI>t=4jj_|=hcrY3T zxh_WEpN)Qt9ZWCl3Gpr z)1%OKg(PWOpyjg3Mx_!^ZK|`VW9Xmj-ss!2Y_NDm#!-M{HJRx_Uzl}{Xh@FnZbz$~ zvUz4*P?u1K=3Tg$U#K;UlVBaUzksy57%p>FYtG;M_j*dQlo< z6duhte)xo&1ihWLY$x1K31O&y*yyIEaMo_F|K7F$j1fY1w$39;N*1(D>yZyD^Q+Rb z)A5P`q0y&@;e;8=)$$kzT);ror@((Of&SwKE_2#C;SwtH1yd*vLAw;~bG4f{0>cB!g1%lRK0o@BTG~=R2 z<$X1_=q@5VU?$zj&J}mt(N6ROPV^(rn=YGSF55X>R+hz+Yzepq+qZGzUfcCS`bARc z?%D)c9ma^Oc_`yZMsLAccS|6PRPOptC!?P)`d^c#lUR0xUzV_g_)9K7m0y3fW3$GW zhux%hovEF}*q#^F572-Jl+1lJT&lGymLWeODZ4#cyk1H&xX^>gc3nclr5$ltzTPH5 zcYnsOo8`KJ4pLlF2`nFVUUVw2Uw^UPbmKE}Bq4m7AL!W#tGjQTD$e5SoBK9@aDD_( zY#LZX`ybLYoIsY{Yw{4q!E|#$jz=%8R+S4{vN*!NdUU$Q6_jH|#(UiF!Y2_JmT2Fg zt6F7mF&limGy55AJ&$C^!RxEsrDj(L7e32EDZw%u?EeycG7LE~82Milw z+kI{E*e(Mvs7Eub7AE~X8nHQh0q(uMy|6O;hUZaRu9W!6<+FG8H_{TFWf zmaPE<`dx2S?FoG_39uElv*2!3g4U?W!e7SNpH+LCHz?JN1+3pFwiWfXyC+>O`B$$l z2`CF5t#eZSvZNXG9!Ry^9zuOXp}Evvg9Y0Bn00Z=i%fIjxYUUGd~7dq$&i04af+e! 
zV+g}5j1h-zf&5Eko3|dj8q4I5L-L_G^aRVG?zUbJBzZ#-GhV_FiNL?e3oq}_n+FAFkNS(XUm|Mqo>RHpF zcLi0Tp&Fwmj>+tYaL*-P-)$S=g^SZ}ewGI#U$Fr;;8YUQw;(^AN>q0WO+rrvUUv)x z{t_2m-fKzW(+?!>Pp6NW&fD41>TL&yeoS0n`K_;Y+?6~LhrXlL(`noMu&Z&?Rnk$1 zY>D=TTd*KqWb(RB{MP%cCF&+QDvlN$)ey(&hm~lQ;Ww$w1MjNV&xbed4)ywr&sP&h zW!#Hko+$|oM5+`>p^>EN{8OE^zZ;FHHzT=GcYBfg{z5!YUTtgP&Hk~ACdF|}w#0jx zZNf?viaV5UlfR4NvvIQ6-f4AyWn=qzfhzMKE8F8FDX7$)Vi@z}nf`NAy&?=JE$z^C zUq018*VF&=XMbKgQ4`}dYO|off55+g_{57m0)g-@Gh{~m$Ey0Zv}e)b4lxN>|GBAt zjRyFlHLw4}-dhFL)va&73GNU`a19y=?hYYXa0u=m+}%9{w*Y~KySux)yDfa-?s{hS z>HY8B-Su^ys&ms9UB$(^SheQhoi@fhe$Vrid(!?#dH!=KZ!v#50ORfBRuuc^r25b} z;6S05trvw7zW*>9(lKDXxDW|Evj5+O$@(ah%wn7x@V~3h0s;lE?U~+p0}mPj^nX6$ z2Pkx-F+!WDKSy0b$dYSGcf;lFL)vCjJp_(xKmXxB@rC&q^CpD*P?Oi|!I7ekH!t}g zzqpwI)aCqF3McvhSW+Nc{=fbC4Nw9n03dAhQvdI-{JSVXb^Z^l3x-`^4?rWTw%zag zK4oiu>ms*3TPJfnJa$P9q}QrZyYYFhz2vPtP33oyZF_wQ>3K;~J-*M%$}$?d+JBwr z+e!&3U+NCVwfiC^_jAysqF7>mUe_l9?EN@;iPrD)r10|cXfw{lCkge64uzPR8dVHg zdjK-+O^%x?vK|3bsqO1eN;?zv;9w_hEtr?X@^+(}RPKxEP%OK5L|X_#>&@ii+oRkL z(5VpuEkHo(z5O`xIFst(_Uy!HfEV&fOJdTHZ}D~HCW}su%-O1lONQ3B^)A6z1V^jw zVfpR5^DSYg3i-{(Kl19yV4z+Pub&7-RS)=&Ixoedbf?S4Q^IO7!tO9JphHK zgU@cou+M55lc+;X9JZMQepD>-gdI%gEf2?iHZQph-RK8WU5o&*<=rFAvdc-d z!1Dv|{;A-P3eP6DuuI;|((sG-&Tymvqkx%Vdal=HAt@pgiEs<%fmVx8EIt&f8vo6KO--wseb(?E_S} z;_BjQ=bc)GZOlpv*%qY_fxy#5sO?ssjAFq8RxH0N@!%iU#P^KaDigld1_VE7+Apwa zDDX%;)ebnVKgxTOUGBDfNj}}A#eA1thU{=`y!#n%FHkl>uiYfg+hGz&xVlomD+O#h z8+Z3bEL8{+T{zi&I9q81=W0gjKpCoPNqEkW+r{kO{WL5;;MMlHx?+mE>#^jgXHLkHd^0(gryIp@{%d`fPl(*Uf{k zSrrh{JF3V0C4heaDhuy)l9px( ze2xM}!@h__yy`WH?>1&r@n#n{{tK`oS?(P$PfTZ5g*wG`%)usu7eZ>Ejsbs7%gHXE z)uIno7LOc*7ah=w<-+9LZWrIx&M9^~Eapn%+Mwaw1vZIXR>*XHu5%st7dit2?CK2@ z%c>0Lank0`DG0!3@*%%gk8P!h!*Rjacx-yjWG-6OB3wG_m13XtwcV9xyeV;> zFL{D*G4!}kpE2jkmPvuQ-B`D3aI~k}r7OJbx~aM#^%+jQZycofka9hT#Vj=GO$)nt zI59m|e^lJFhqKf$Bs~tu9UlB$s>$efz-l`FZEJUY2vm3)Po&8?EPgmYlqmxKcIbQz zYfRQtcQ5c_>-Aat@ZL0q_q>y?DI`Rr?8$L}iZZXWeU$56th zcGcgXbtq2TUfWuVI6lyIUQNoM7;bHHoMTDI_x02sB+OT8Yr1)?rAiGLN+1Z7yj?mQ zyxz;cFqmhEWfe!mf_3FPb`+;ab=#ftMRRfUP4n0CH^sp$~c zvq}`n>*Se_!wKJI%|P?C$w+*@#S9z2t*Z^Y`Fuh`u?oR>=*OkTH~TLJ=yBpB=+7Z8 zQ5I(}f6RAIZi=ez4R#5tM_V!SRKeb|qQEl33E75eTW(t!;R4s4{2gOU+sZvL_;Njy zQ8ML*Xw*&XFactTAN@L^8G4F)k10(f8PdX+!-!#jRx7fFN;wsP zo5ju)P0USrAq+7LOlE_n4gka%X=i9?loR|JkG+g~I5F}0v7KY+q_lnolkVh^)o5a@ z7tD-l(?UEPjz$$u!(6i@hwPTy z0^2SmJdBQ_d{|TLLgK3necTLc_1*z9C70pcU^Q`7B3|#$Dtr@Gwl>QGz9HsZ{vKp< zLO7kzk40@}*ILDfLi4q5mpkvscQlD6->%}Ai9JHOb<($|30R?!-8>o08m{^`bsnfQ zC6Nl$!#ra9@9-BNlF2@vv|QCGTg>$sgj;lUr&soZWSh~SEeE8!rQzAr@W7Z$U$CfT zSx#Y3O?nC@IT{w-na_r>Za{Yy7ClYEz9csf@?f)yIyv8E7(NsgTp})sYxN5MGBnvx znP{*En+>h;w?vk{M(oi)$+Ftp^ty{J#$n9FXap>3C#?|TEjAWcAs_W>>(&nPcNYN9 z38*yJ$dg%(f8|;Tn^s?BA3Ul9M%Z(1dtQUFl&r#lltg~6Op8n+Oi(yYE5kW(UblJX z#jna9u=1Pw7LfobLz;6W<1~PLN5iG;IxDZc=2K~c+sRb!h_|dhZ7s2!F4nUo+39a@ zQSzdK?MZyhAeB9mq{4VuAqV1%g(4n}FHMIAlYOo)t&cZ@hWD3)PRx*4YB>fVG?s~E z;FMv0T@9!2GU^7@H1P4cgSSAq#v)&q#e4Is(m?79z5AGd^-}R$!wotO^;)xgf5eb8 zP?Hnur{kGS5Z~2$z(vni)Nsd(7v;0pt((AGcg75a*>Y9YhpPiJ-dT_07;FJsAF&;c zd|=nrt#l-bNAzH}%&Mhp&gO37G8KY)ZFFO~2-DZCTs+SuJYUF+(aL&%x#JRfzg#)q zZgT7nxNntjkCX+IvJcf9E~qcnnN#vNyTnnFd`5K}h>oy1_c)cDA0x{3Y-H(^A^3LE z{wj8hNk)Va(uL*E-DBHUPwNzc&lX*0r7;WylNq11++XPZ{>-9(gxjb@u(dIuAAufA zuc-p5TDESMQ8%9DjzQz?kG+tsEs{${I>u(VCAGn!$FwSWen_!_x#x;fG1?M~VgcXQ z7uBTmyQlymne@+(H_QuZg_`~68UFTgHlmKS;kYkV+~`4gLpVLVVWJ14E3KNUh1$h= z(&^m8Ev8tc;XdhBVEvZPv*+(Lx??EyW##;y7%Pux$`}XxHYmpdPX%!4h;cfJ;MY6t zU=)14i+I(vZgwuPbbEldt|u8F@s6!B9>S&C;NZZ(p|nTrApL1i5W&HQT`3CP`3U^}4tMe0@0zw{@_#JKrMJ6t*hQRas!h0EP4 zNO8GQ^ufHGpt9(a^WKlm)+rz0se&7Qz-ex<%Pvlo%xR;uUj*`F`UQi9+mFxbF&BQZ 
zcW>Y_J9%IMdC;8F9Ojm&XgKCY%`t=w#`W((ooskI)EHttgd}zx7U)l#87!e+uDNT5l-{rzwzc z*fa%*UJ0y`IlAo0TMN6rA+Ij?EI#IToLlpn*B&brw>RQ^)I;Y6oJzUTiBN*A9ly6! z+h%f`z7lcUe~;XYx8CywT`;;xo?oP2S%VfHUmP4}LvSC#8`nDinh8kntZ zh1JG{YvDKH3H4Yp*o zna}&;ADrYK9<$+4hz+LlTJ?A9Rq`sQidrdl8&p^-BSRkf%mXJ*PlmoCW5i*DEILCa zLM&1YHAqto0WV+H_F18gKn2)gLZy0Bhmsps)GQvC!;Mt*8i$)BZU@kq-wupIKr(lo z`{+^uAnHhdzCUf*N}c~g`7Po5xPz=-6v!JIPE5Jzp$02UvU*pcblHii-LCz!dNnVE zco4+-?E}N*vOqg_@NPZBcY~<{*kr7>!D{-fG_TBr%VmrHzI^K)S*GJkPCNgognd>M zv7Jxy6Hng*BDmWDR^NlC$TYo8K`x$l>!}azLSvDdu9pXUFT|bl4_7Tq)CU+XM_2t4 zL&^L8q;ss)TRrY|Ue$n~&t`Q4f*fFTaew`0pnsam1cb_ln$KruUuq_l6@2WK`dX+O z&TVoT^qgml0r5KB`nq0iczKrS(Z=GYuKfzaIKb1}RI~viCtG2q1%%ZB5C0tX?}hd| z+0oW?ofoNNAmcQJ)}bRxg@XwF&Ca(e#1IM>OiDlVTJsdojBeGeJlXVj$~*f9yY*vE zTYQWc6m58qAvQb3tsbDu1!0ihxM3LJCDeH0JJ3vX3z%T+2#k1PHhhGNQ5;)#?iz1_U=%S(=8%fDsSU%N#@6b;jk|0rLj-@!euZm?lBBOv}gP5j<*yHj%xlP|=dow?`{1Zc8<4;9vak+KAxr za?ckGWWTaBk`!2dw6$5cX0oNFoL@QkocvKUhJ|;63cwuOZ*uE<6`h`;6o2x z2NABwqo6AK%I*@c%?^+l%-MRnCtT~lCW69Qf05)aoT zP!G5J{7Jhi=P73`637KffQv{O>T=l~cAD2IxkxNkBRdOE=4Y*`;#K$p5$PrH3sf6> zJV4jk@jxfy_N#hn+#Mw}N&2=P+8@UKkmT?Y&E=wyljxwj>I3aJN)T&Hj`#e|P&`NX zG%(Bem^DNjzw3Zb2EG5Fgx+8LNtY%Ya8mA_3i>5@9{8zWUarZB0_<4eH=2W!{k25z z>zfzS&ds`cn5oLJ5A+|6zv&Of&Wefa)S8r@zv+q=VQf?x9z{854B5AwVWP|T;kCNb z38(NsnseMrwrSXaqy95R`Hf2{`zQ>LU0>XYR6g=(5r&-oOAIhOP0@4M9f^xF2yRFZ zPhNsbQ|1H{u8LLd3$K4KRQyI|6`&^R)DMDc?W1>PoY!)Q;0~l;f<2{IHwd<|0aq%W z-A0WwdADQ(f++ZBYzFF6w|y%hL?*}`f*qzD@?rkUpuvTV!7VRFp|@3Wgy-S zi&~C@cob5L(jyT=g-XD)7NT5ZrE?dG>2DTO@?y$xa}STc_i5T>&$GEYRpF0-aBBL8 zp@jLn>S62aJy7P_WkPxxRdFICt5sau0U~0neZfJ(K)UYHi{it+Bc^NH8ox±ZkD zjhN1DrGA6V6AVM$XUwx|gR-~Zo08Cn=mnq70`A>(qc=KEEXw9yj+tQ*k;v%GqkdPL zjxD8|gh47UKd$={O%VpUN z?K<6W-biY>-z&F{YVDm+7h7`FFmc~BxgFIAUUQ|E(pTU}*I$UCVC>Ld36$*e8#b~g zv8k1`d_Zxh=L8Xa^%PMGNvr)|R{Qz*cO3UBHCK#fSC?vMe3OQ`K~H~-#@aln<)O4KY-B=Yq?Y`^yLg{fC)BS!j&laKfgl z%a5RgOb}XnN`c4`bAA@PC3zP-i*Y|Z4gAtbdyuCI^?jt1!wab& z7gaRI@V(GOK=bFio#s-HGT2HU+b``!mmMDrH5czlBCR;HY{G_!|Dy1z`P6_Aj;WT6o7NUW)L ztC!-luhgG4N0IGN4vP`6RsK}|jcW)gtrv5lczvEv-%MdFbCQ`4ynRzicqFsZC+;x< z#vVs?yW4;VnYm8o%H&^W<}!ow<~m~d8huu~byK;Vy?!mYK~GCOKaRJLX+;CRI#aSm zkEz3>_0fk4s907)=Uyha^|st@<5D(K2`ss2K?0B*Sl)w5yh9fnL8yJxw)w2cKo|pz z&vFZQE2mHPOZXdY(|T4UDP{6?&Kk@FH%&=Z+v#7k61(J)pl>QkA{WKkmb1luq6v9& z6U#fee0oeZkVkwF29q|tt4p)OEbft((3#eNfGthyM)hEHB1}4jV3h73PrW$J{1KtR z{MTZtIQ!vmLmb4#3XKaO&d7C?P(k5+a-7Uf;1O?yVQU~g3z_Bx7a@eoSY9PZ;F-uxqy}`9|j6G2a z3KWpZoRp#9>$dlsLAUUiw0ZTZ<~~f=mN}?>zi*FNW2D!A(3lKNbbn&LI@S}`XhNFg zgK|mnWxdn4SFEClb?2iWGSJj{de=wnen~3d)_GSwUYFt-VaNm|mxAwS9+?<+?qa%4 z)|!CkrA5@5qHvI|XnX##dAyx!jXdqDgJWLoS};OrMVyDn*J^3N0A|I`vat+89?w1HyRQ~{EvT=b`V({>(2TB?(9To&jDDX@JeOVZ>m z>zy0_$(eYdeOBQIskxp2mxXtrjUP~|eYd-di7^r8E1owX>#|yQoMiR7k>5niGfbpc z9@^WnVO0CED|LARQ)RYVbldOmt|_>|HHvY7D1G=$y z+slL>eJNYjErRT{nvlS~?ZNw{=0$Qj*GY%r|K#6me$x18&6@8?bAA`R*yL0!QT%&M z|MOU!5}4$IXNf{ig@#0Gp1aJ!Z7mBfNxm(pVa%hkrW+e%oFwwT3AYW|RZE*5v2T4V zO7MIHOO;OBZ?hHOiewue_pUSs*O-J4Hw)Fp^wlc~7g(^qN%(=pw?Y_;!Uy zo>Zx-eG1cXk2CCamVf*hho2S?SnXd4e2qo%qjMPiB`&?%}=`op?G&PrOB10)+@L=t{ z)t&rbs|nLw4)EbQ0kJLa$((X|Kdt?lj#=(`R!3JCThcna%L|kVdl^a1{LV1rEiR*E z&#BWGxb=ysSfjB@T^ddVrNrQvQOlFWgGs99+`2|qpwuC908^X&o%M#Q(nMImU>DHT zFa0oVZb*6pM2GLc>pHXl#^(3ANs&I@6s4EXiI$BLKjIzR7p5z?Q!K@N75Zd8tET;A z5fSKO+AL-jQGU41Z%1d@8;qz4%_nVAs@;}`*mVUtw;dBCD_GrOXKhZ_ACpg5KL~T} z)7+_fU#<$2(^(sI0uG_<2QvvcOJC9B5U3e~Bc!4BGCIGm@!aMW;>eq{=b=)UV5$k^ z)i+uKaxta}Aml~3O*{lPuhQ3Ev(B*wax|@6gy0U!mrl2)niJ#Ca=5`gJ@;oQ@7ZXg zrD??^XDz3=`Yx`>eLBpdcCXWHB9nh2EPfW&`Vd_ECh+w6j8}7R4^0AOG5q;YAU*zX zm5OyWG!PH8CTEX67k2R92I!DpLn|^eQ%@%VQfh0J~-WRr!|6o_e|#+ryMtu!Xn(H8x?D8--+w? 
z7gf+CihTHt^mAho#5T%Pp2JM+JoyY*!)pI_No!?1NCY;~k{)0*Ybc9ULT^gCKB5Bq zR5&ey+x?%49Cql{)Gl>vV%QeyG3MRSL8TMDOoZ>lH1uMsis^Mw{C7f&K^)+_aB6^t21En5PKIrUxOkOcxFUt;O0-Q?JnO==4cJ{uoYNmCAvuBxz#%_0pyqX+C<>tGgYx~7TO(SWpk<2stl}FzsD$l=x6U&1I)qeB+kL19 zk;<_X3bra`JuM6V1zob;T1brP0d@cN2>tI_vwlc3E1u#kUz0L{QL= z*Q~@_YD6pFUIkh~`ix18=2bwu_m=@d5e=A0*J_P#vipelK;F!?M5yz4*(W0u4!rUU z{WsU8TPNK+{vCPWPZSvQFdy?$ZnrFXjeIq1Xfq-S%p!COM#QD!b?j?3=|=@0$dV|xi~=x)EWMhoI08g+g9(LL$j!N`o0lLk|py>CQrS}%Gi?H-z=#EYJ_ZKm+o z{{XX45A_V!W4Bvu#_D$n28pY~% z$$GE$zQft&zxq?K1Y)|v(4IK+3y#*qrHW_c4eawOeI(p9PG*0Ux!sq#DgIuz{k|$O zHM|@8Y~^dzH&V$&j?x=NaR-2lz5lA(!9tw6@xJF}6N#GM)Cn!Ix`;Gy_DbOiOGqGq zpp^(}NAUYmwrhI9$ZGfgIVe?OPCn~Pg6*1d1((wfd!ytBTV5CnwLAH-Wzk$Hui?y| zPNo={^^wJD`^_H2LD$!3?_o<8m6fy^!ei+R?SuAMQ)NyvjJ4{n4QJBJ{uxU9QPaGP8UtqeTKm7iE^4w-uk zN$n*{s@Et>wbOf}I)oVIvmTj=hPXMZjmWLj?w>X^bUV+q_!677(8L*q*Pv5`%+6YM zY1%t4ut5%^vMC#Fa88Izh^|yVJSpuqonHJmUYBX7_wwF803TDLT`#+Iu+nj}s98XE zK9Lzv<-CK0NNp2jFX)2H2-kTL$V=SplWQCl(GTW`ual!G>D!ZEYM!kKzJJb$TZ#S{ zlq@ywCATsRGF?$(7t-x_v1#|ZbwacwF%v8bWBYPypO1G+?9=8(qO-|aXc##f);1pK z>BsUu>=8zz1kP^#u~Fkv#qfZYi=z=|(k>rs+*2_`0PHlcUAdOcD^2t8qvEk)Ys-6+ zBASUGi(f5!y~;tt4JPbXiN>>Hpu3u}hD}FQ-j~C(VbaxrzpU8t0ci|gF&0BL!=deb zoiiLy2QETrh`P)vz1enyvZZ?0}S3+TfTMoX*c_NB~EEZ6tIaSabHaV zZ1#E*oGZvKU)f4QwZVLBO}%~Sn<(5lS?H=Z6rg1U!Roh!hpzARqKFCAw*}v=V3wSm zd8O$+5T`*@=)GAKKd6nt+lN~ro# z=X@i%#nH?M>~EvdvU!V2uy+fPD8o-K*h%eQf{^wdS(is!w$J>70^{nTR>>#q{&n`- zxyoxw&T#VivA-=-zgOW9u!P*FMp8t@0&t<;pdoz4B}$2qzE-PqsXO@^X%tTL3?5ZlT-S2U8$=3Q z05?5A{v~B-0ObRgCjLiwQ;WlIJJE}hk>bB#%Kznqgd)rjX=%ms6m+S7M0OM9eEWKa zP5bF2{}El4e3!w@EZJ%;@cEwyBk&dKxINN(* zye*#2EwTRygT}rG4iqJ`gl7&H9gD?4bkdvh{(}M(p3k{I9D1`$LQrZ~}nDb!qPZ`z!x>3I3m0 z7etmFcy!{}`Ze!<$>lE5Fyq&GUEX%A$FW5R$oAu@kHk@w+iV&mE$ugelz8zOWo>i2 zE8@5O`Hs5gDlo|su2~wY{rM)v6M)Ttly||mhC*pvJ};gfmv@mEy8wENiW_XKEX2lV zmw-qSX#et!zpYl*?)hg@VMZ2F~-tbX1MBSHa-24H-qcqhAdh$)SA=Bbx5$uk z38B6H>%`Mim5inv%_I4RT1?i3Q9}OqFu%u_Hi1ttPv)lr_j-#gPKyCamqCN=q@WAk3?t)Xa> z!L0eG??;HARI1{)2NJ~L(7pCCcU*wxP;9xpJ{DDTuvv-I4!~L-^nQXbw9l)Zb0OH&)93%cfc@5BK?wN5s0W>cG5v@(9Iy-Zfmej0W%~6K!7t^z;Tp19j{k7=@NqLpE#QnEcDd-rEKh(LOR$UaU$5>vG7T zIfKw*J&Uwh>y={Uw*IzoJ|iZT&yBXk71LoIpg|?UZ=v}~&LeqGqRZPK6q_86^tw75 zmMf)H7b_2u#}8d&VLpO{9>+jCV4s(Szc}iK&5aZgSdini>}5#QwO`jus ztWjli%aAN!TqT`!)zp&bqq3&nAn$&C%n1E(!wjD6Cn-zB%9s5H^d>AZ5c5miD^+b8 zkLNT&7RQnF^BAwx1e{zxH(|zRqP>3 zM2?ZzUgRB6{cfM;!_CPub!~bfWA3-#Ovc-3<1P}N{lsxR?kGBQ@BhA65E458TEnPk zZ<59CiEFXlw7$mkg*)5DYO(1$wt&NW$!N?;arG#r{AS>0G+lz$s(Qv^Vc?t%T~x06 zT{bz)!-C=c#iEGGa=xmZcz)G`a>7so1%SHSkFw7vp>1#V`@|1wL@it{$a)_eK*pl= z8Hbxt4~=%FDD8i|>Zn(_#Q zqb+fg!BjVg(R;l#tG}QvtQgR!kkKOIaf)QWf)?4a6?9$fY&III*^k5to9B*8dh^=0 z(7K#6%3=EsMThYx7+dHqgHswww^}E>50Ds&>1w#?o^#o#QVPpYmicps`=vH?_kOZD zP@hllkoZfD-p0p5o#x0;Op!7>;xwBx;vW9<$n{OH&234^LIZU zwr9SOh$Yu&t4yKwwSi)c?aDm*G}c=&L)=9vEqLxLDGGQoM*V$M7vPjXUjR`5gn7Q z%l-&RP3rMR&mQ6nD;XGpE`c5b^Y%zGD^x35Q`Rib2EAo(F~b+!o`^en_%6B~Pu#WW zYLj*|M+(P1bz$w0rn5v{?-;V~;J?kP<4@Tl_r=|3sSUd#vsR@Ynoa4&hIV5u_2Fga z0m|GWb9$_R43o>AVCWe0mUI+=rU338X9Juh?L|HZDjn zzZ+#MqgAF&wm257AuE*)Iz}x*XSmlZY+0xcqmcS7Et@W2pef&!_W2<7UwSkdQdk5h^%GQNahvFoW7i#Ab@l9`3~rtm zvkI1mpH>BRtVm+QVCFDae%l%9+f1W!I*`)ont1!18^@7R)_&3?HTK+&UUNDlify)4 z9DvW;?0i501dql8&(Yz1@K{m~h~Hxl7g<*UwnSzaN17|}bfz?g8(b*D>$cYkzZ2R| zlIZvR_y>LEFxmakcGJ#WqFA@Q>}r2j#JT8Jg`6HC(=uy5Q}!8dXkR0l+3D9zvG%fS zP$~E<6T)5oxS<7*tl{SvK}88p<|tn?;{j2w$H3{F!0VsdLu0-6wpi9?eE%9ItNF@m z4qr{l@sm|)KfD7bLIUvS?OI$Ji1!;{L==w987}|&;-T~#`|?Hux5nRzYx|X zCUA>!@d$`hmv|&rlq#Hv+{GMSn#*2oE5K+vxD6<9#)i+{a`%mV?3zmPLwr1)W}hoQ z`TxYP+5`M*?BP}Oiwb|`SQ;wcwx$NXgMfYN8c 
z%b7ol6E+jp=uuy$*sLOxEa33lZJf#$_f=rIO@yBWKj}38%e}oxgVS?xgbYf_NQ{bke?-rm4{)B;RcF;rEE7$+G2c;@SE0A=o;yf z6zasw4ic}Y(8ci>{^mtuF4qZ~;K4ZEk~$JEw*37XA$XMF_H_@yGU@YFt|t1ldcY~v z+7pw&moN0i>$$7Na)SHC{M&d_`JD4juOKcv#gnHbZGTp4wY>MZhJ&VY3P( zghs^s6+M2@^>jMoazIn^ah(-q`fczh*+BHyz*%C*;QUf!c(6l(Vnv7HMdLwy`svcK zmVXmx#DO@|Hqq;OU}OuI*_hj`{vK{4>nzd5Aa(km3fDD&jr?^*Br7xgBJRuI<&_pT zKZy~bO;1xi};m$SKbY&O>R;$Hu9pV z?CA&P%!ug_fJin1X@61v!b$WvNiGuWxl)uxMbd$?>Eo<{xL*BKB}D)ISM)<{Gop}0 z?ZU7H8=MwyAR3m`u1G2Mt?2tFnQ|7so+r>{6ZM$z+u(Cd2f+AQ3mMCEv1&&#TZ4VN z`rVOxYQ2c(#rBXDQoXe{5BGS*`Bs!uS!el|&^|;~U4B8O{I=<*U}`lQmZ#jz3!|V5 zsy!fzp5oN!%+K*;K_Fg=5z%3LK*0fpSk!tjqm|#8tL@QMaYfwt`K%j#IA60)E4Dt} zVomy?7Z+*O(zvo|M)p%FIM3$M-Vx3p>{`-f=)61##WOrGv6AF5$ z7_~9@sebFGb1FF^Du4Sq0E=%%fVbxCy|e00p{ zv<;O-lHispzm8;s_kS8Lx@JG9QnpY@kwtSg^UiQ!SV9xH-RV52z@+>&40k*bY^ktC z!!tU0UTO0>AA7wKVibp#O~Bse@p8#HD`nChLLigfROt+pKE;9%JPUnrjR=bU_kFUA zZhyP#(%a&6}D{ys8wuKRJ^_@mlRu3g`D<~lt z)$z|yOv)oU)~cQ9(Ujb>NWB_=w$-if?lk0@>-=&!q~Wd5I3a&FDzT3Tt&g|KMk43c z{$wV&NV{}y%49YEU5FR8_!R3)M|68ZAZ81dlJ2nG_S`j^Nq(##o&J( z{C&m*#F1yewVE6oVn%keITyeU>kYlM`bo!(BV5UGqZq{<)=bkf3 z%(Ax9F>lT_D(4YLKwm}Q==iDWp(s)5Aw+y|glP=n+H83C0!XBR_E*u{X!6ssWcF^y zCpG}1vt)jc-`68QR0Me29>(Lc8m%3v_bDx`fm=r}giNOKX4|X?9B^-i7daJBpV_FI zB<;YM5W)VSaErW3XoqE{<=?;0vyJ^_S zkk`9s^v3HtDabD|YK*G_hE?78mxJDr3nrH`ahYVxd?Vz%e6p8J#&*kH$r7Dov!|9iPX3A-RMdcxw2QIpi6<*xf zm25fP%MeL5W*7f?ZWqs&8;8R)MO=E-L-_=@!s(>U;@))E72OG`NbSFHpbjSK2(;g3 zn7z!0$MiOM-O3d^KDVmvw*2k~RP1OHA0GYS*M%^2B7q!Od&$SR1fUm7%6oR4b{h|rWg)i{ zplQLUw|Ic9EB>Bs3Y9j|eYYyCyOOUd(?d( z4aNFjGSH3DYsCGx$G+MLq!J2LS}01ZI<9lZHRU2yyL`5`VJYZ4;vKR^fXhd>tyrkK0>>x zk!<+w7Qm53l)e#(VTKwWlLOjvNq?ZoiC2}l`C`dKk`uv_Nhf+nYk~!e`eHPqb;(x;W?%&{g;Xi#2%ffXJklr1o8r7v|#&v zcbg^?8%)(<*s2MO`H`W$7Yb1)%xn_xhAJqh7C^7_HaNc`cK{Kz2T$r@PGya$Z^t_) zaHZul42YX@hgFN9l9rAP%bEj8asR+{YKbMeVqdJroQ~X>KTD+xEL8yMf8Vm-Zwbi% zVU8#Kfc)RNyEV6C{(z1K|KNVpbSoDW{2$>~{{h}jHEDZjx0be+=(K%jF&UvzzVcuZ z5qDzeq)Q54;}_58CqCd1kKE%vXUdr=OAX^b17!NDtR&wH&Y84~usWXj0qysE@T0eS z-FB-KUm3~wD2mSpFxeR(=<|SFsZmcsdA9P&{{DJD(VP{3eNLu79pr?UVZtp#wDfDR zIzrifkxt^-)2=1)f-eutxLB1L?_iK+)H~&IXm;5sG(JkY5PaHcNfdCNmYD>9BPF6^ zSUVXtckC~CPv?C1=lMAst$R3yI?4>eU~FAQgG6Rp;XMbdIHFN(QAXd&XPZ*-& z87XX%@c}2E7>Fme%4N{!t@@ho{!%VBK5sr9mJ)%xN5%g}q#{c&xHsdg6KLvBEmu$6 zD&4qE><+nwb3cnAT$En?38?^4g1tQ!V8)ZqUxX3?1DxnLh%gk<+K?P?n?cO#0wkzT zLz|rgm>;?j;UCj*72ZM_!lcDR|u)5RN zU+h$oM1qs?TgqBGp$Ji;qWPn#W-34Gj5j7pvcz?aCDMgoW;W)d37m>(>Q^wf0s+QN zA2$oN@w{(u-_EiDnDjt&v&99L3}5Oxuate}&(r{J)R;j%G*N0nE?5j-<#$TaWvafx z-jLiO#d1K%pS}beBPMyzi+z)70CICm?5jH4cZ`G^5NSdf-~nmE1^q5RtmP%XLKERW zESfLZZ0{^#LKek}X#4_;oYYup4`I{`9_a9Zy&TQx`0|v*NjJfL%inTSeDIMw>FGnU zmxl2you?=+qpE52pg@@aJT9#1tyDUDTS_NJ|o~j%+BE z?`(TT*R_H6{liRQx=0t)DF)jw`Nb`-F9$P4#=G=>^Y2yoNHWr}a38dC6?>yMQ7AFO z88Jp2esW``Tg>)-Z%eo$UUoTRyZu^DjIRzGCcrT5mG=>0hnNF}Z}99d;+AaKiBmG6 zo-M2GmML?Ep-2$9+#~a*8*g%PS_22dQxI0FxAXle^Ad52$%LUR?Z7w^vDUOe;jNOi z-ian1XsMpFbxp1Jx&ZC4zdm>)$Z5s{8zht&ul(P^sJjqUyrU7B-+$36b{O|5=2h61 zi1p5^@8KI=c_GU=;-s-dKnS+bgBsJ{%lLDPZUrtPe#n{H{@mr}6&}-{gi`s*&x#?E z&y_z3RW@rP=JcH21=ARMGtR_KNa`HT&VP=_v2FrX*bH0LEhYyiQfCl>kEW!s>7g-|L`Jw{7K;b;LleC)q(;! 
zZA{ZI3$P;dW@-tGe?qZxZu@~^YQ2zC7>b<@6@uci_$1ukwe+_2Vc#GPm4Iu_tMiJD z>*(%4_3VA1-@E<|AsBAYyNa<C({wBC6FxCky3S>gF-j&YQp$vsYr|Y}Te$$+3!_{g^)=W_@U?RlDLsmP zm-QaZJCNNzDqu>SrG=g~obi5rcY;Mb`HI!o+^MC2&YvV2VVSaxPh_IfEQVTGdNN_{ zcM+)b|Frj>K~XL3zc!nUL`8yRB#VGblq?w}O3pdw9EJe_0g)^qAW4$soU?#}WXU-* z40*@{4B=h4ANRBO`May$Q|HV1aHy#%ie7ZMSFc{(cVGRxwMemPHcq~wE_q-o@9J>( z?;H1<8->z0m3hLVsd=TaV@hAOITN~)hBx+ujsQK;ryBHr*84P7GmLjIF+&t5#<8@G z&v4uZQE`^3=dzwoUle30ZI*BPrfW`FVjowg9zCs~SE5?cJt4%Q5}wLWd)M8YE;By_H>d6b&(&{ z9i1#jm9d49lHW6-`{f6o?r>uEhe6c`cV1+V0$xKs5eflDebaTg$5EjAW?!m+VnysNi*yp@>v%PK7*j{tDV3 z-g)vLH#9?!NHC&-UHh2VDM%V`r9r6S&vDhoX(N2Bnx?<)wvOVZirY*DF;PkRMLgi{ z=pViKSXz!#kndUqcW1jKeV4x8>i_BLfRG7s0z^uWfG=`5Zg5$Y%v5uRzgZ@oNgZWs z+lvpb`m8kGKneOd$ebFg=HFO|Lrzv=m{rmY5HTf&jWU2pM!z?o$|u*xVB1oCSFsy? zSA|0@g)^gz!j`DnoEJ37c%hMUWYJ`2EX(J{AP`FEp>R; zQh$ExKvRsM(6~&^Wt9NQ0vigcViT-$3F$#l#^yofzXSta2ht{t7xvlrE&C5V6+W3R z=RkHET(|!42|0qJ1{QS34lN}cL30i4rp|NG@C{bpaHOqi#}RWT5Zr?=T+j7ig6J#m?~tu zKIcA^xizS5;C}aWb2~l4L7&ioqiCYoUHYyqHGfw$hfw@dUUcp>g|0VL#pV&pTE@cDuLLwOF@Z8m7uAv4md0;CWZj;u8z62V9p5N%pJGfAT~|o6kVKFyf*=W~j*9{#e;q z^3zutV`bCY@90Z0VxBNf3dyD4;AxB!^dEKI6aQEY#r+DY_qv27esAet zphmz~085boXE;h)3|>t3to@P}DRF8{P9?nVr|D$N&WM^a*+Tt!e+|;{Ykf4Th4 zM*o-D`_p4$JC#if90C7xQ)OJ>0bK?%hoVb^(#XuF;{Pn+@tc7T`` z{cps9AjL(ezA7pz&&)55Z*0>K20Ar&jm0DQ0AS>+8RwKb{XU>i*X!7JweB6dDss(y zyPIiAnob{ZgAwpXdC+jv)o_sW5{P762`JSipIw-p1B6Z)7nZSM+ey`s*}#muu^|)| zm{6qIt*v*x0{yuQ`IZBjfS5(>EyN-h&%zWZ(ale9$nqkMJzDJIeymJB#dT_}S@F&Nzpf216$vsJ z?oAUOp2*TJX@cNA7=0hSg&#%GgAw@g@DILAZ#oftz*PPSVFNo`UoHZ-865yL6Y5Kj zAw#MFG@Z8j--y}iZ?_lgND~8Ec1ASiCb$&;zqIVG<$BKlwf07#xW=QT@%;WgXBXAdT<> zW8ge#)a@FJVV?NV<77N?>iftTmpUQXt7Ge+&rL`$)h8yzEV>^jt3}0&ikkkP2kYi+RWPI z_bzwawIA0lBTbh0-lgWY(-p3XSfjd>E`<~M41oIHw%G(8^3W*&fB^BFl*q0e7Weeu z(}C~`blU<}1Nd&)YV$q~?JAQ7iaB_~V;@j!GtZ`bvH*;Y5PnK2kPFUZKiv9$*JW*q z_wXQeJ2KR?Eg-1hyUfGslUp+OIfv{|(K~AY3t2z)pp{AnSYI4bZ@14=Y4;7Q&G!ts z00n_8J_NW#Z$l@o0dkvnB4V&!D9gCP#&rS0E3bt(ID$mxFCxc%i}yaxRj?;k2d!Xf z(5E`nzY{qd)};J9=BsIe_55yIc5@#U%r0OlC>Z@$OSyqmc=ugul@IwX%iwi2Ef1F* zAkYkdo`WVKtDWcY31N|tia`8pjbc59{Vs`%m_evc4YAjG@I#)~gE9rK$HPsi9F*nm z__TrDN`U5_d_1X3yBk|N6=U4*^F z3r6+4hmTObaav;ElC*^H`9{OM%TJMOtrv$=yK4-*2N$T%@zWU2qGCDP-@LEJOyzSH zd-hB7&T}=AmyDo0ijsF0*mRc#;2HXwN7dYJ)Pa@QF2@`2_ZNc#xnd2KE=L2rFQp>4 zdk!JyH`is6m7s05y07mL*EkQVk8A>f4Nm2e&kD9HOCtB`^^jxL00EkZF~aER?QV~C zHPKdsmtB7d(9lV@*_=Mb!OMPNFd6b$S1z%PhExnEK>E%x{}{H0At0q`1-=Vy*?O2N zJ@*#hT|L7OW$hPwv-NORM#?vTTbabfYA(jO(+m)cqV9=@=I|RC*`ED-}BikO=7pYfy9&Fa5hiKW0^L~h3 z0NHRKosh8P%F&8_cz9vEI{rOL;dbkqMKAx;RzkAdMTr&u9uQ_zChv4N;>97ruf_Ez zKa-t)l{=<;?dsa1DZr=P*dKhEn1e;nY--YlfF(j+F2toiuQWV-Ws}d_)o-mz5iN;x zce;4O)At83*=?r3yjevmz6u{xp32nM-JB_ii^$*`zW*%GKingrE5`mOf`b$MM+dEM zp2NJni-5Zo6TZ;dC&7cW)o11YhO+*P;PmJKgTQ|hIK)1JpX@rUw4_0-0PXWX2%I7E ztscNqSP`K`8fDIbrK+YhK9+~jJ{ZMOooR;uz|6_Bazn$O0-;Aw{-=~2Y1BWYnZ=z8_J$?%G2E&$=^Oi#oED!s{YioKU-b_l#x z+>oCHj?3A;alvAFMHo7~RXmda4__7`Agmf6n`eO#-f?(V!kg-Jw;BK?Ovsz2uh()-mzzt9 z%3%zi^Wek+I01UvANeT5KfXHgZm2rE(04hNKRy=TC!TJ0PXT;Nuh4C>i7r-(OuLhp z0C<2aOF8?615a7V3gML+4a~klyut>}ShuSc{9!F{ zKI=d_jfqfdoR~xtJoJ&zEIN_{^Erw?}K(Wv@hWEcHJ-e3MbCNs1=^4}3!>JO)VI zCatp6SY|4OM!AS#WBRVa=Q!0P-)`onlHG4cU5IeB8amjxTPCBwTkzV5kAf_7e?+X4 z(7=epWB1{=cf?QqbH}PU)eGZ4Yd-(+ybQELao>oP<0*U|d#L6~dm@`I`|U|@K(OQa z6lA@bawU#Q(?L5|&b*F2E!A==zlFze@BC4Ym))oHt@C7^7B`xGRBXwRHmo`44r2Q{z@8R;JVN zEO#JNPp2V~jj~0=`p~SPWRwy|Ifx4tJ^nqUwRZmXP75H`na%m=Jv=2kAD67Ye=_FtUa@sa^TSSXuW!FDYl}q-Czf3_5KKj?s>W}g|7KYuueNv_0w?eZd^2^6)}>`H&`g$Tm66&RZ2~6ga&DdDt>wk>o)^CZw=8cF?Hn*p01 z0I@q5xfF%!Ra5%z@db_>GsxXf#GTm){ts#sTdySaxk~sl#%|`gB7*Gc93_MSk>pw| 
z4HJIV<{-3NKgbGD9f$9q2>v{MlX*|{7idF&3)+ZS9LFQR(z-K#`F^)ft8-zwb5YjW z9n}T9cZ+A2Kk2Y^W>hIfHOckvg&*jI&%gCsRTs#qaO)z&;au%Q3%vBp>=QNZl3mNG zKNyu#X#j21aQ6ojX?!HB--JP3ZrS2pcLO?IS9ZL+4D>Jjp;-aC$QkO4Tq(n5w6-aAOj3r-bO={c)buNtcb2#Ge9W$E4eu zWzRx3zS+EfLwnbq?laoCtWYR=5s6YzmxB4Sz)|EMbI5wNCc!EXEr)PQc4{Y`Um_=p z)6jl6@`@6@NZOq{9jRas73Z5YA`411xHbfNB6GN&+&Etb$(&%b^}hFGp_Maxc~=bE zT2kfRL){O=)4iqlRpi3&O;^>(EGO0)niW+uZ;`x`#iV_=;=cgBh-#SZpKECJyxwam zTXmj?`L->d55f_t4V;$3(9|LCF^z^VDL%zT2XX;?30fJFLUYXcVD^Gcc4j${lWgQx zW}|7j@C1x8lC0qZ%D2c|BB*zvr?#r!`+mk!%UyGba)pdNzH6i0R0iRvKDxR+=wo-U*(C!5|kun2zCLWbC zv|FYkz08m_wdMua=WDi#4|CbHtz_oLEIJC1BRB?KPv?O#9+s#&isicbbe|C5@Y7j? zi?g__>Kfhd+Utu}8V8|=?p*X97qmj_LyUMGy_;Z5$~V2N?|1W#?+K@`NP!C{>Cgf? ziN3qFe`Qh4`}lA((y@L}(+LmWF^cdz$8V0^P+(^E;lv6lLfsyVqxa$*ZWMIT%9E@9 z6yU>)-u|d98==HTyN!=JcV7mJ} z$o&2*G`8_aulCmhTRr6!c<+h|GAQk>LCJ@}CcA{YUI=^t4F|;BQA{Q3o=yg5cV_3! zt3B%N+Q(`D$9el}tU>#w?g)oLWKwLGL*=4~aXBz}1s;d-MKh`W?{) z#sy3bnPyu;Qax12y{3c)_xtm=kptV6p*FH%Zh6=acP9ez+q;_AUzLqCCSlTqrt*}He$aMza@EhDZ$ zbqqg=bQfG_MeSDzoYC?ZL}+{o_=gWBI~$cVRl$uE zV2vUSGp|*{-qr1^!o@Eygz=~5ypJv|aH-XXb9icO3`~dyMa~}V@a|J1W6}_ai$1?M zeqhw{lJ~9w;1pwG1r0u5F1mDao_t}Y|4!{L`P|!k37m>bR1qMv4b$ru*~JEK6ME)$ zxFk0fhGYw6!sPx-_%*506BW)W+rXhBJpz@uI*CVQ16ja2fh}YGWeas@P?%hRt6|-q z-_$YWWJ{7Cb(`R=E{!T0A<_NqnU*{RN}*$PO9Fri8dO zmu#eqoXs=x6Bm`DZ=e3SD$$tt;aN@4^xX@wn&tAPugT&->lNoklr)qgdsP~0ls3_2 zT3Nk+=kg#yfP+Q)8+^NU=!Yt+#txUiFN9h(S4%R%0+(U3crZQ)83f-E@}k3i3DO%9 zAwttKYqp#2&Ya)_KhWKB&=ib0qQec1Y}g*!p15dsr{woIWeYDP;Bxa#P0!jL^fCiUr^)JG7MpJE#ajQ}y^g}ijGrd=A-H50=@OB11lON7 zsvdgb1FlaYVHqmado@bs6xGE=HZ>eRD(| zQJ|@69_lWPrUdOgU7*vUWECopaxE}!bTR{|cRjjOSC|(gMS?^Av;#)g++{$)IT7Uw z^F}2!#j3>p1N9+RJd0|V!uRKWQ2sHG@!a*sSgMPtl*)L%ZTXwD>rOOmahAAs3adG0 zzpnPX9v|IK!k98DWmmg^<@|Cti+*E#TSFIavWA}+(KuY4T2=XJP1f`WN0-e+w*@63 z;m2wiV^{GUy4l)Xf5^uW%WV&NEQ+5{QQZfNjIYIdHlgnb=+t6r9FYff5`Xueg3Iqy z5AR*`tuEZ=Q7*x{Jha-F{ElTevLwXSVF}z zBpr4_;#d8BD8jSM_Cda0MOKPr{oBQo9FVJu*f<)yw);NXQ^=LOZb@fhs=GT}6FbPK zEI+G2;{$e)#zE(cUd_Ul(C^GW@I=GX{T6z6UtFTFTDB2WNgdAtbgi86N6m(R*Fhj8`j#C?cP zv{Y2m#UV1yRag4gL5%^Rh-i`XvEwAU!eac5LXydH64jiCGJW1w>wH)9wa%Q$wjCr_ zY>Q5-3KOTL;zx72kouhzZ`3i-Y>9;}aP|Y$lk}2hCbBG-b932}u9L0tgE(mx?YfZq ze6f5k;yZ80ic}{JcWQ9SXU?z)=jk;#E>Kr4D6AJN;$T7s+{KzJ*~fDIf*euu9-osp z!3NdCTc!o$6(g(GDligAWV*2FXKFl8|Dp8E&PQjOuy+@J@A)CqBlIpJ^$T7vI$dot zB+0TUd=(27M_M!;Z-Tjxk(}AN8n1moHfy8TRk{U)4->qEJI6rEEaY6V$z&NP-?UrY zELtxx!bunq$s4WtMa`Rx_RUq}lzz!K1foa*)tGbJjKccOZpj(_ zY6WIK4L#3^;#F5$(X6`x5k_R4vhfTJPfI_{ltCm^NA)gEv4wM&Mxg9Cjx4 zKDmVr zAmQYzV3#YE1Moz;b?Ghg(25DP$0yTS)n=N830(Je2*2qq^0AOK`LtlbCmq4-$^?8A z0d%#NCQpvJNXd?d-!o)xldm-5hbFo}(`+Tf>Csj@CYgNBKUgTtr$}Itm?3_+H1sOP zkvo(#@-C+CaH)B}Vl56tdt`H>j=aCbpTgDJj!*5d_Ff)C{CylH%}$r9lCZJ<-sYAr zX|+y!Q4e9)`Se?Ex`R#>j5BSa1Wy%7@fnB4iZIyjefyNfr%^Y9E^J(KwXNrq1Er8$ zlYS8Dq{Z`xFya|MBiF%H6%ToDIA#-z_pUOpYo;*LdVd;}wTPG3ejT|Qb_UAxeOwaZ zsoxul-&WFNrO`M?%BEX$TEwlJXkmoogBJ`-(NE;QZxC>F>gisM;p0|^=OA_D_0EO( z)Tca}y0Bm&@M7f?z9KH2veT>ft~@I0JtmsN?(qG_@5ywUg81&=g~9CSM>B%TzrG32mvM_zJZ_$) zOd}E0q!T&ge(3>zLAkWRq{d~#Fku-Rl^b$eigAdrxkrKZ9*fo z@q>{$aUHyKQ7dxoTzY|0VhU{%cX#KGnge+z{;)23s%hRXjeVt5{$dep@#9J3jUr!H zZlU!y@>{aVtgJ!Qvij7C3&D44fiMDA&62|arQOj+dxTk&qOWyvM*jAJvG~sQo{(?4 z#O|l4yS-_k8!-Ih3fhQA(Ehe(zwtAV-_dwt&qyI!^fruixoxagx*ofzGG7#rm%_uJ zxTUt96sdgJbsR$wWnGjf09d@R)a%*p2}&CYYeI{E4`+mN}92AP9XG2)Y=Emfgb_qY>A zf|Y+?#a>@+F>N`x8*jGWIa}#1<)#o2>0vTmu6nZdn&&1o=k2qsTQ5! 
z?Kq_5o%F+gJee!|ayh>997o{&R(h!Gz#PUOsL$$NlNl$3D-*`P@wZgdmlI)WE4$or zVW8SxD9aviUl~3-=+4mdxVh4J{1^iy2oJZU)*oWz{UgJoP|~l0sk6$uVd(?tUgy}mp(4S*K%iVFW1l{w;sa3leCF@M>()Ds1QU6<_A8+HI z2-N$Z)wY0cDtYzF60{_-ZV2zn>p^skOwz&`o8J=2t3`{R49Z8p|KO?;EYv)bP8FRo z9u@mx(!*)jM_BdraBNXlN96nHp&oiP<*ug=wZM#X$0VTy&gA#>KDj%k8CncAlj+wy z_YQ<9w<)GSw4g;D!_W7_Ub>Z54D$YILD`bQeWeBbF(i@&%sfqq6r6!m<4O)e;qc_Z zNw_mAr!lWIEf5KL`lTptU91anA$x8{JcH1dX0L3DisYnfz0-08zhJi{O)`=YA*Hm> zu{@4iLiSc1q;FR#(KR!oFI2T|FqgO=JwM&!BV%g?`L+BDBw!^ddiy2QjC4}*HRb3n zCe3RkV0s=?gNn)62;~$C3e(<(w2%M!*PluM{c=*KfwoJ2_%nx}+5G)0U;`PIFnhKf z?N=FpPy7S511KQGxcu#HY5&dg*KfmUn2b_&#@N42#C(HAAdk~&boci;XJHe_M>up6 z|EFpHES3P1kS2*Vr2XOVbCw_hQs+b1epe@8(ZVSD-OW9u-y7@<=w@BeuLl3f@mJ%`KtQdguNjbj{o$YOJ^;E| z#86nv@61Z@{S>HG)zwQRq~BI+<+ht;5Q1ob*QA(TpnF>&5|NO8Tdl85F$FTHQ4 z;IKLw5f z>p{zCN1_^QPqzKb4S7nV81m1zK74TQxyR`= zLzcs2xaVP?>TdsvhgZ)c@BwJsp~QL@g!T`U@ayquyX_46jtjAHK0!s}Z2 zO)Dcw=Y}$l*gUJNG03@G-f-R*&-4 z{*-BS71b4W4OtBb_PpbIHP&gz+ssS>E>KZo&Lho#SdYTtNj+-2R}wX_&0wo zOF3GGet;*LUb+ZE$@vMewWVn01H)0l$T*(nz;gHNKbrjv+!;jcJ=m8S; zvu*x{R*5CEel?19yhn9qjn`7FV29@^M;ADVjUdmG#Z$%B_eGH?)b-uMzHob0$Y~|| zjhBpe!wTp2aBigi{Gsi~TFOdJ z)?ZIr04jD)u5gYjtKJ${5lFJr%(7u?n*;oMFJDiVHsR@z&@xI`p(%HLZ5X)5q=C)L zv9+e9CUyX^@v<)ty7A-6bbND+v;1XI%2lfkpYTBrq5XOv;SS`0V@5XbekJ38&PN}S zQ`p3}4{QcTfTB`OxZdi4m?l)!=m4H3VE_F)$<Q3u*HWQQ8gUAo;~_q! z%rEM4ZSC3n9{HFq#-aV|A(T3zmP4m<3XO1JYpm3qDb+~~({?5znFrj(dU{Thp$m~q z*jfL~I}J%3uX5k^D_>`Ggd+xgOl#Q;D#xURI7j3X9$Qnh>sfV%h&TYlggKyCRvUu5 zr3;Pn)E-Pt_Icy+{SDgfuoSSvR=x?v6^@yzhNPX2HrAZBUwVhWwwmf83_{FcF{PW~ zI@b4e35h(L7AC7K&U&{5Zb83*yviM{ncy`vHDtf}db&4l>c4k2igB;Td83L$JNDr> zy}q->iNnq(wln)uuW~p0RrFD&Y&Q_2SSXKw&*tBm~QD}jHLE4Dfm9Ad-0gQ!_N z_etgPYg-q%@ltTKwH>z^J4BS-Sf}B?Db=azz#t@PhSw4yJe`|r)IsF&S((q%k4`Wk zZI795@9CJC0eIKFDR2AA<3;eHNB1txlTzIUUaZ*TbeQ_ohP{}2p?CYJ4>gps8m&P2$1I5c3}b;Va)3!juW zcj1RkS05}XC;OfsA?$(3xrYG~^IE&e?_Qj*Z*%O7TNCa^aK6;awqV1}&0(N!lLa3hATqs+0-^Ygar$Imrg(NGviA*P26he?l zGf;_c6~9|k%=R>C{MeS;xa}n86*KAi?5Tb2G_HI4&4lgs-iva_-J8~vy6y3{nFVi9 zEt{`@YZ1nE9~uouZDZ*|BAqlI_5P&=UIc+A&H#hXqJNI zXf#DSbbcU!FsXRYOk=ygEY+?dHO>cbks} zYvPpndm@(??Q;i(hCbAPdf5K8_~_x9+|MWLvGT@P*5?c>^PVhx@OHV*H`|Gpj?YRD zJinLrW*p6(B~KK%bCO>xXZ-Mh9BfYQ{^8$LUbJ6`rYjyty?yhEyY<9F_h33N6AT=l zgC=r+^?74^rLj3ul|kt~mZ^@0#*v+%HJddbdh*vZ^AL5V1Yc4hvF-YzuS-y%0~qz( zL*Jg{u<7#72I&i&fw|P4f7ho}N@Mq$bGMy*<0E3DpSK4tHn_6ATCXW;J7~*OBwh8# zxEV;fDl9R$DyeBGDzbMXEJe-L&6~(np${i%cC50HuZZPPILcNF1TTCP>Vz7N*+;s0 zn;g@#dUPfG9Wf+Q^_P+EWE^@HA1O3&)K!Iuign$AdCc2!A00A`+n$MLZ7A{|NECS( z6X`bEX{+^!1uNEMNrq3Y6^!}{OP<)NP1kHk=fjOr{_I%VlA@hf>B2u)D>}hK;FqpB zwzK^?`V{?PLiI^is-4aN@6tSF#Gt!i93RZ3-vo^Y$`4h)A#|28Nd8(zaOZSX1iI zA*k?HvtaAnMD5F0ICKmA`cBQhXotFSx;i@J;kFm*A9EZBi*tTDuyRckJSn9XDg~1syXcLzq%}bKap>ZZOAbu zlh~>+FjL3cc4v`c-KF>_(p#FxLMK4HiQMc1)caTnaiDK0gr> zx|l9f?LrV+4)kBHZLoY`)nw)`iUWD&dDUs-Y>m`{DCK<0j;9Q^Sc97ag zYqPJrqt0WYp7~aqE&4*A=(x>widY9X!g7D+IF)+ znjnB8qbXLOdcu$4xE66#SN*Gg$Sqml6!Fv2OC=#6$v;l7;=@n+fMbl6_bfA36Hg51 zFEpnQZhB$=q@!M%_UAtVqSI1#c`E_7U@0-%+dlG}5FyX1~ecVJ% z74wA@N3HfiGZ6 z>pLq{hO5)42;Ji@NJ9<_l5SqwGUT{ci{szcIz9kaibA zgK>3Mz6+t-c0Pi#E-}42V4e5ROEZ|mc*?CK=}pM%z?Z~}C_-Uzc$dgG3-i;>X7yF- zTfoP~hWPh``YZfq%TjnSLL)8&(vJq>LXAw+zQ{Mz(zZ*Sb2V*Bkpml#^B{lTBK@k* zE!SP-wda$DCEZuDJ?qgfe@H1*AD&*G^Z0d0h+w@m=Iz&poRf5G0msqoj;9hY?XUk_ z?a~Nfs=0NVGBzJ(m?d6pjfL@m54D<(E72zkS6lpM8Qe>qIL{R7l=PxIsXQ00EfXKD zS_G$)HyVDkAvvcpT`-M4dC~I?d$I22mYzj8iSJTK$K_t=4^-I}ZoTy~Az8XRZb<5l z>ceU(*A!^9EPsuw7Wb9AKaXvRZ3etVR zqfpuBzY&1&T~cIiJNF5$vG)U8A?(@fOlrv7R|-c9NHPP%Y$Obh<{$MLE}>=!=whhO zb10=}?{;i)UpbjYKDPGkJJHw+lq+4}9TG99d1-@K%5tc-oBihFW^JFHLFKU6n4COb 
zU)-1_7m4CcK(06YHJ$oA=>+B-eFZ?Lv%6AJ>^EMrBf{?6=Zh8Bz@p4_KEpm)lSVu+ z?D^wR5*6XVPt~4PPaPLq=YjZ##+(M;#ZcB~WWmVvJmeKl$S13i_5G|&o);`GGn;!_-59**{~b;E1Jt&~r6J|2cZg04EbgSH zs=@BS61=>xw()>iaOC)>*Q(+rBV(xMh{0WA$F$P9^Mu{Z<)P`AKZL+|+e@Uqq4Nc{ zuPz>p6Jp@`Pf%^n&qvd%;7Jofn)W6fot#KWz7(4aU5JSB};JI znH9xJr^`fz2XL|c>wKzcwpsUzkjaJn{2z#2)&%(=QiCWJ>A<{V`bB-LIjT#&ZwP|W zoBJb!%l3r5$4VHJgs+yp=H&do%ncq1rATj!6F0V-)U8C))HSlD+1JeHjDJ4cSVTXZ z5*KUjDpavauppbV-pDrklly5|->%c_Fco{R>i=sC^#H9C+Q|N?8c^O4=h=<>m2Rz_ zix8TpcILyp;E|~u!9@Eu`wBz$gqpi zZ&NEj0M4u86JHnZ-&)ix38`H|TKAJ^01nFfm)~DP6p+23+sa?0*C)~(RdTa`JxBT5 zMt;SPf28%y82_Vb8!Hxwiwg61R{;-u2~uY&=dNmZ7~MaN@SnnCv;dD;*vb0vua){g qa`|2j_{9F-KmYwk{a4$~UkWtBZ#o>rc z2j6zZ5p`5cjiBDU#@TBxPzl{qc?JvLyfSP<>f{1 z-Ka`jX$`{0hNZv@}%t&_8;^Vt;L;d~tq7!|quD(eXqt~6cM)F&&P?%FSE z(4UN-J-hEdF}hhFfZO}_3B<;VW98FqNafvhhUdOTk_@{L(Vn2sR>+JCher2Zc*Lbx8E17r%5OpJ*0ajW1;bjW4~b(Jqo5jiUwP& zrP|JN*ycr4*`)Oy-HaboB)JJ*VQh(GjL05{`e1FE>V3=+of@tHAO$=TO=R)c9YlXUw^RT77r=!nr)O!bnosyaYhEDE>CSI z-;nANsaQK%Bl_SjP-Vf`{My%a-d!XiG2m5ggcav2zjut;pdI$%D!1=UP@B)(YqM34 z6ZurEF%kW|zwqYiNY933ZdBmbetURmr!DLKVW6)sYdU^S^C$0JDG{cO!RId2afA0< z``3m`IZF{Z9~#^F%DZR8M!m^28T$d0PjLjhH1vfFc5mVc?>}sg6y%sosEuUt>I|3nA1FW*)ls;@fQmltvieyIMAg^ftXThd!P#-8CzmyWQ8=zZq%} zD1D=-z}F6Ds~~xVC*SsZh+r^S(3Qv-Ur2_#o`mq*Qy-$vZ~jk-WJqZ~Noy-y8wovl ztkwO{Lb@zoi2TM|nXejn$+wBI`pq%oxwnKL@2(Rc z5WO(xRS6R?$DhQr`KBU_6C3=Xy~mXqO!DA+#)4Ztp(LKrcc)(h^$%!*&Au<)kj^EQ zdrrk8`^P($dodl472n-3cuUS;tw5|9S524M&1r>0DJN;+`UhKhEKj?z<>}LeKkhJ- zRo$$*s}&;h2GHSQ?)-|7$MQqD{!LwKhbYhQl@&abR*qb?H;lq1f+gNIzeD`EV+mh! z6_x`(9>iUXi2jkcn7Smmc(m})wY1)Omel!CTvzde>Mqx@xj0q!d+zVZ@3d{%U8seK zUD_L(1e~D)Z4spMq@*`*Sifn1DtqpT16H zl;>q^h^JR5%&p0F)0Eht-@x4{O-@dZP5!L?h1|^ky>`PGPhp>Cm?mw{o)uXoi|<_n zmHFqA3eHS$rq(=-CxqnQN{^T_DkC+O8>D@oqv=#r1VJ{6?sz5rt^3LnOOquyXSl_<9Fd&ph$g--()a_ibtl zX{r;~YtabIKD~y1%=T_mFblbUiAk32AKvubka?e zA4W{7*2PLmy|OwnMUnPj?8`Py$D@lr+w%+93d98zgf{3>G$Ep?GU&L8}Aa} z7YGn|^AN(%Wfx=5X5Z+LZ|@8fFA(`8H|{pR=->nsEk6t`leQfh-B>xebMm3Ku{OI_ zA7Thm_SfvvV9_j)E|6LK$fXGSn2`%HsWb6bQCD^enZMaA+U$Sw92g!Lb*^@%c1DCR z8)g@FNT^B(B~BqW2uFu|hLglRdExvb?S;ULy7#X;Z!hsLNpuRSnv;noktZo7QRP<@ zHnYM#;s7Y|ZSgMOP+x)llYwpB!zdK7*XuPKZ;MLAmxrw#Ir zcIuj3m|Rv|bj-&s4m+ni(k)E@z?pE1CCjSm{HfKN(n3zCS2==OtY{7oa_EKdGTWx! 
zmODB;Vz}dXhv*K3`h=4I#SKe_rAmg0;r7=xOO#!4YDa2QYF(M$BB!lg(Qjr(8p=f zkY%Z5u}c^=lX9DtTaBdrXj?ti&;0Xf>Y+y)JVi6PlB6PTGHSp5pXr$5TeWY#eSK#@ z>CUMukV-rAC?YO8N{m#fWyt?$?A!L}ZHoJxn?{cH!Y;chOhw8gAnILQ06AcFhw+h0_^n^VdUrGCrkL4;qg+nv}ru^x_(guzYCylNL!!rQMX6$Y=Q{$ zO;YOa&`Jth0o&1`R~wC!3a4Q{PrC#Pc-Lf|wRa-L8^mSi53QN|1UdXukx3VQZvAe9 zdq?-D*(b{WFzzunXufHfQeOnD+1dO)-tG_pdt|6=m{EI#!s`ni3H9Fk1gjypCof}t zbkc^3Xr$kufBEvlZ|zV=$|7+2VzQ>qroD{VEV|)`o-&7Wuacz8vSZUp(zyDH6mTZF zy3nEOY_-|Q)ri38q_)vXLYqx5-AuIFsjapXae(+S14q1rzs~EgEwN_-6!@%3`&853fD^yzBo>nw)nU2L`uv`Nchj8CiDQ>DcZo3w2S>rd-!JO!?5&9U_DR- zleH5!4M*Qv0!`x=U01pjAo4_5*!9V_hJ2~46XMljd&bs+=9D$nASl8H@zWnbZ;egRy8)ek#j=O}j zgi~FUvq|GXBd}EHOC5x}5~+6a={!ViRt$>FhXaA!YxB!z0wfhAQMA319R9iIUsg$$ zq#hD(Umv{VM{FAO==}Jc)OLL(se~mv;#Rh1R;t9PpDt$h1f6y_J0XLcqUDc3)&LO^ z!-Im^Y={|UvnU#bF@tqkbcUNnn+Ew;?Uwe=gdrW_HE|I!QhwRHfo`9U!+IlRX(?&N zq~wD91k*5-7Y(P;yP(u^Tr=8$zT+d+Zi=r3^0)+^!mLEWLf3I~I`E{MiEscv`(q@W zhWal;0eBaYwyVDYRo54~AL4wU#l7*u@;=+fXRf)M(l?6a>hUK#Uo2Mb|2o)~w62uL z`L+J*S1r*azh`(C)oBgqkd_O|klao9Vk?fkEm12Dt&}_##p+t?E83{4;;><_Z{XlY z0B{JfSGd^EL+l3&b`!&Ku48}iVL#7v@c#Mq+IkNDKd#Q z=>KdXioL$P%}r1DXA=(x33`214LVsDH)}dUuBTj2=_PN{(b0*!S=oqcK7a9#?%4lH z(A#-kb-;B@zO_ITyP>Fmz%*C78K=ef1J zr5nK21K{FJcRB7W3l~oh33~cVLjU^r7oXNXfPYhRcK^q;uoL9Ie8SDc^_2TxV`ICD zU)~kf0Qgut8axL$Va0=`At@vxApU3jzdiXk#sBE3|L>kW0zAC`+4Vmj{dZR#cWXCU z7bh%D56OQo%|AN-=fi(=6z9I2`hSSxFFF5t7b|GVo8sL6S~SU)?9XvsVeThlMOLB|dQQzL z>z)>>PIHcS^VOg%_hwk;YunYq&BlGP3uq>WYYmKQf};jc_nX!vU555*GM2V<%b$=F zkV@m=Uj5Y_5c2Nb4We7Ok{0@<$mM_AP8uhq1CL;xEV?2V2k!=5@Ks;Z6gNn7pmYds}Siy4q?LI%qE0` zbh6Sqx<6YAPNN87*DbL#yU(UwfuxjpLoMbt$<>#_4q7dHz-eeTGf||Y{7hC>4l~=} zUO>*KtzbV<0zj1-RK2)EE6HKqmwI19*TbXw6oYK_GOMx~81ZmK$VMM+B_DtzM&)yw z#b8|`<;hK2M(_>Pvt}q*M2qmwqbL5T9o2DCr4%+MWDBOuV`GF#2XRl2Svp()jpvmL zGs2qFW(NIMK5Ak^7%cGgVjn6c7Ey_Ombbm<{wnI3yvR-S?4CI9bnRR_!+AYbLcrCJ)avvE+NUY@P zYT8v}3#A#Z#V+F_+w0p`@)AI<@TmUC(QCQ;E~>;8ir6qYq5!MU7LYYT-_ZdV-N(Mw zrdAVdc&;{4u+WtO?Sf~k?1qfZ)P@_?#B7a}kXcnbH9@wWEu(5s#ptYPvM=g=XyLQn ztU%ngbYWOqL&K-+Yis+7^PlZ<=2(BYzwK_(;_<3cv3ognANY+6^h8&Q{s^-9HvB=- zdiR+Z#MY=FZmfkbOtQve>`MKnqr_#=&s0gdJ(tI~oVkGc^|L)`+rK;hajof#T*z>6 zVGEdYB&#Yv>5=q3v8HQ=WG>20WhTBesU45+Ns`1CBuyy54m(q6py-J(E?bY`g@BHgI%VYHz7@bYCIIH}?%-@yK;$K( z_6rG2{y|VKD7mSra27qtvZwI=-VPnmuS+IavN^Q+D(1T<@FK&KdS2ND^AytAx*-y0 z>O8k52p|Ha+4VkzJD#=8tNAFTVjgZ$>-G*0rS8BP-R!-HOy_w`kGlv7dc9)o7WtJqs~eTw3De5H7c5lFpuBe<60I zy2J#NogQ`!;n*yuO=pD!%$4KHJ8e;}A8fb7tXaC6$`3~kxdqxb$HqKQ$2wr2WQ+?; zlmbO5I1AG}OM&(m4?Xf3$z3HOSulOj;r{lv^5l1v078DQCOh=ntwvgck`TYStSOn3 zl&(9I3r(@d3LB^N7Y&^)?kGWs>&Y0a!Rd+>tn>B>C0OQci1=z7`UfsC*QSb-YTKs} zD;DVz8MC(a<8z86jLclf<8zn% zoWpOTY8blHLIWoP+XJEQdyYsa2${JmARN+|Tf06GH&(0cH0Vd2GE^kK=JBP!o^Hr&}o%MmM{910pwLp@{`ftDex;K!8%_|V(gWMo0RIy zSlWJ;T-aVCr+|zF^oX)yHeWR(W_u8c8KHxm_)B_d+X5VNgbKG3C~?hOBN~J8$EfuE zBn~7RrgpPr{MIt=Qi^PA9kNAp2n)HxH^dnEK~*1qUeW|7-?_@4R8~nZE;LytXg{5t z%APj03MsMR(n9h$AD21r|{HL?}GBUmA$Qk zg0^<1>P)Yz`$K_|phNxiB*~gCh1s||PgRG{t?F5Fqq9CUI|eKgeFW>HpXH6N+%flv z?CA8vA|5t*;DOZWwD2=1P56JjAB?&{uVq_f?H1)+GlILOO+&)WY(|(bpPIo;{TpCS z8A#mk00eFb40$#SJ|v%@Vo`I|M{P+GrD)$`gPg}`4e(1lme>lzJo#jUj zhQ2w52**$$37X2{y;I_7?%tF7gi0wo<;f5BQ z8g+rH(qZOV*4m<@@}`7lV9b~aT#+$%vhe3guqKa;FjQ`Zr*3>#xA8( zFBQFF{97HxzUt~IFEvFM1qLkm1h6b~~^YFX}#iLI5wBsgL7fO4cC^vpUx9RR6{!KT z=)j3O#oExhn|al9M4h5CCkF&gnf!ce^WK?}rDsKP`=oj%FvQ?%eqm&_GMgQEOvJ8r z@HnZc{pc1${FtYrz5k1tN~@m4N^j&mzo$--b|HVwzT@-Yd95((E6I=0#i!wR@ZV;c zVpfYcW%uP`nd$CRgSsCXA8C%43yR2GlswY5ccIVqa!i|RwPnjnj$JVjs#$yecr%Vx z$`lKdK5Wu86CS~Vs5uW35j*tsz^-R8#O-AsL`d|v*bxhtt*+72(=&2#sDvh!{!qJ; zx4-ODH#rt(rhQi`xJv2%y2r?k#hJdeNA$l(ysTKn%l9@#?JE287aavtEaKgh+?Kn- 
zr2jP$I;~6WeUm-&f1%Ta#oo)n$>{&NG5>=v>HN!60J*N{|3b$XJL4Wq#e?4*O0d#X zxl1RV&069S^?N#d*cm6GUU6UTN-vRr8a-BQTKXI0zo)Z&?ZyxbvfcFe>%%LD6#Q1)VxQ=X8mC&(?|;o|)L1_|w95va<6v-od4uu%k$3az%}Ac0 zB)7FerUv(hbC(?mZ4l^-Y}fJx4T3w$-6H;PGl-YCREZJKmZo+b5gK>io_RArn_v%n z*fX!~X%@?_ATD(Y*DX2f94SBX&a4Q+&cs_o@FAqZRE4#CBG+K-^f+5@@-9>vWR#u! zt~+dRZF@IS%FNWZ&a4SJPsNIokkBpC=3toYJR;yliT%uwG@)Og;4J!rZ()YVA(mC zOmZFGU^i2sL0fkkB?`Ih5Zo7Znq!5*&tv9}G%je|Y_rvfj%-(YkwK-YQlQe*`MGJu zo<+H;y`cw)uT8*MfhQMZ(MZLMd00=F_hI$g5n4#9kJp_q3g=ADzmQV+zcm4u6{}}S z+{J_?%_$>H(f%`!e$5z~y@bxbdxDgNsxo&s`khe}YE>m_bLdSsH^&fBH~^+AQXh$v zbBeehn+fI*Bh_jw1Ge%aa?L{myhMtF2(Dl6tLQ?;r#J}pcEh9+%ePt?efJn|OZ!tL z1un^i?XUEu#=~BS)1|N{r-Tj5H|sRh%?x&Ae(ic69jCr`{CaWPo0*HNN3^khL)qoS z0KX8gpO!`L&fx&U4|$R$XK9>RARNsk3`%iKg&^z)0+i| znR%GG!F?0~b|K5jBMm2`=ed$ys;a$DfiJHBfk6-Gq>&MNcB`n?9uS-}-UPzn~R`YN~d6prI$NOV{#zoyhYfW?j<_uZsI_xImwq4Kfpgoh{&-$|D9c1T3 zb!NF)-L@x6hsrTixi>l4=DkA?=UkcEQ!Vc{+P>u_?$zGbt=h?2kkA*Vu^_XSte$&uAmGb;EL`4;0;=Wss5wj_D1>AdZc;yBZw z-g5SnB!gGpw(m|0hMCo}CGFSwvWxXR_eFQU69mzJYD?RqYOh@wZ?2zCvqC%yKxGubI__~GDWM9nswz=jctt=(F>Wt zb-9XP_L#u+ww%OJQTe+=274HiM~Ug41*X-iR z($-d%!Na27*}L%3MJcSWNHMOgbfp+c(c9~p!N?yJ=zIP(u(q{BYToDK)^OT@y3{Pp zxWIzQD@XwI0NrC;1;KbGO^-ti{2CFKV^RT8h3R5X$ah3ei>2vfRb4 zqMD2q&V7-SrR?!q9f3v zs&4*5)9(XSP^lsRFF|dTpji;f9I^ftsDf7m>x2j`Y_#>FK_w}9>_U?{ujl%Ty64|1 zWW-@WQmMT$kqmQuwE~lMuDeQ@5laWI1FZsRw|XmA!Z@?5uz#-|Cb*+?##Qnc@4@|0$cra{InGjFwYQTa}=>ddE*PHJWx$qQ88kpMCp)mi{fO)8soxw}Or#PqnN#TugY^nz ztfVC%`}D}4xA!f6{wi|=)^Z==hoq{AP}J_zeKPOtU33e5^Y%e3EF<_QSxxo z^c+cuwnh6Dntc8=R`|LQ^UI_R@Twdx)dy4DVo$uIv}wz@XW{GC3F>)9+o0?Lh(n^HZwgW!PU^Mm*GrYRHMqBG}Um;{$1roJ`Tda0n z)66Bs4g*iMb{cD{bnE283SEMP+vkNxw8heN4w&1q{*;qe<$REEUA;4_Qpo*4RF}<+ zc|7859DE_69`hB!|A*<&EckfNaWw6Cb0IlbvMQaV83|;t>dbwBsQV+9RthlPIc?(y zJn*v<;@xq68dSwt>0aUVkr6 zoXl-=K^wI(>gCW@$PaE6(c(R+X!QirdxJ zr)>nS_ES>wcqLNLz?h%^JqVOc(dwxq;AZ&)=-GnnuT zTPeTqkVHHL<0V<3K05`v?SEU#1EmQ_tRFe=yx!z+CLTmR^_@PLNOOwruC0 z^wAb3xN1bS=6rJZdrc4q&dW2`%qzfxo{mGRXy@nm*@`nhExTXHhSACoC0iicT13B$NZKW&%zp zs&)*1`GtMnFO7rXBeF6#!{QEoaR=+nC6Bkoe4W1YqnG!Q2w7*$QT>b943}>)83#4Z zl75*1oGxipkNhmn$Kqv1hG)q~D&3)h4ZE?n%RgX}se*d^$yk7DwRc;u8@l$or3;dYNqD*H zab7zfg6UKjz4@MwnJdh$W!8vrC&Az{-1R7C=?(B2leO}+3y0%mr7Zt(x^d(?j{#wB ztkMn5blwu0gv z&68~(kMJPFzBp#<#8iKkB1)Fb_k;u2=lz=JB%!(TyBvD_4JxKo=fV6%MgxI_WChRm zN{Xu*4?p&Zp`Q1Sj9)Z(1I19@kbtlr;_RtBTTty0X?pazEMJmo*yR6pA7QB4jDC3hvug-J zv(0)cmap+6aK?hPI;gt+VAloS_IIBO}ldH}4!MrL}f>|IowU(1=%Ai5i{djaks{6j`R`1H8gA(Y7bCxev z0NRks65i@y=JjJKVznJs>AB{NnXe9X!t8KM?M2i(`+#}iw|{Y5gE~vu*kn4qc6MD4 zlQ~=|-Ij9w)%0j1>+lY>*R}QNhzCcrjv%UuLi^DSJGDKEOypf+=?MY!8YFF@Dv7>Z zZwmo2hAmebuGy4idrQX=Hw+9=p7m1;`}1{Kf}qhED$aoj2JsS&P{9=TM$K=ILPEF2 zl_mMu#RT#)e7%D6F=($rmys`O0+AxGMSPOqwr*?j9o1!YS(on88 zpmfyM>whFSH6T!&CAHL|B&XM#W3oDfXUISPe!d=p-1sUepe?(9J4Bara|X0uVQaQI zIjFhIu!~)-!vAlpMVf=sVT<0Rj*(=$JZecymr9f;uxE@Vx#hL5bPq3$xPNmmd?+^v zE}NPa9rfE88{Ql2C|{9JnpnNQAFl*_kNL%pUVmTUSQP!a%LI2Db29ftu3Z)?rWTk~ zWvG1cN{@{cSdlB0l2b*(tt!qufY~vKO&5n7^4Ix=`RPb_H-MtSnTp%<@pf$u*W>G5 zO7pAYY=NS^;_=@3{b{HIRmEIDWT)z>Qm154`PoEh`9zoM=3UOmzHU>flH-pglw2RA z;Jy(f3#}9b23c7n-^59EvWb6ML%!Q>S1)UU6$UB#?rN}wmqWRe-ICjt0p;o+vYnq~HS>K`BNZz)qlEpu(A&>6|b z8Dm%zdX`%ag~8jCPhLs=_c0Wk_oTrFRyKzF-qiLD;H%Ww(VSdIZf;11^39AkKTdl~ z%}m>r>Sk*@vQ;T*tWEn%21-EL2O%U0idd+Z?^y7ehX z@$Nx)P6k4Gt6va`^T*jBc^bb{Eb$P=-lQyfEB)9HAplIsp8%NK>=x9ltM~#d$|R+A zY<S`3Svf_+ib3(|8?ubbjx#>!zboxnlFfqcfP4UesVe>{G}$rTe`KhKx_&B8PbSNTGl*FXOPjI})Tk7=Zb}jOvY$n}&&P=aJ*Xx>zRJx!ZHXErK zCTyt#LAP?f)=diFrr1fBYi3zArg15#l)SC0MJ^&|Ym5>^@9n3#(Ag~*MVC+Lur-YI z28bgt1nBLWxEiY3ETKS_yqCKcYX$Sef-T8}%%G-JmjQPY@xB9O;MZbL2^!L^1}T9) z>T)lM4(-;q-L*u4PrkEX&Y 
z+SGl9DXrQP7#ON?GKN`h-O-z-G20aG`n57Q1J7Na&RY#g++xWCUOJL763pM@pCJb~ zo`y+E__0=xPxV7%zza1K#F^!Z;h;>Z(T4dcob_Djn!?ym2F)VqVFud|2YH(% zEpD~=;&MkuUlq3Gx6z=PvHR~g(`H8owq}_5R7Kz*O#7P5K#G{>e64!raQgNxl}c8RO<1Y@=;*11V=UD^Hqv50gSIdAT|gqd z!&KwkM}t4Qxz`=td-#w(h}*yxe?7aIZ)~|nN{H_S_M_PG;{BF(9TI?k?H?D*3+R1B zd9?X_%7sSIAMkdB`CZF|sSEH1AVN@ebi^rC&~NmUNpa`i5*`d607;7|VAgrjDlzWR zOAY*^LXy;=FI)Ni@WfxYQTxL#gr~4+1c01EGYe=47fUHd6U48*XZk^#Usd% zpcUu+U^w2YHwuf85o_rDGN)W)N?09FmJsalJ<(|1Rt;yKCOXkZ{Ca;KV0H4f@#u8Q zvo4V64(^{gB$qM0K?vg~xkCuh|ApPl#3rKze~%RZG6=?LhH3+fh4&-iO@fXSler0B zaZld#6bPJk@{z!qxzCXdX`)LkGrI29aDU9gb;+Tw z=0KhO#!A0A=HawSwc&3$ghdjgQ%#}=&S}D)vA#f-^4N@R?V7f(T0Oyvt@I zY!-t+4t^N2!^aDf?4BMU5{>rzKuwacyd)>mwE32bwG&v?@B`vKqUepLHb7TVP}LnC-nG4mj+zAy7$Cm=iLHJfguryT(QU{k-K6HU@aEa?wC z3{R*VY0)8LHJVP@W1`>7@xe%p8mv6b@jh{ryg7g3Udx2a(JhGGpKn#SWi_BzX%T9@ z?vJ9b6tX_(oQVg`C&f3d1u;mwcdCOIPrRa!K4vHVRG(+U#t4N?lecYxkK3;+k-3yK|J5ik76G`WL4Gl#w?xdN(EJ|V3RK&&1&j3d1!l zA3Ih3IerA=$vTf#xm_quwU*wft|Q!9)3>85Z8wE~X`l1k zPBjYTEimG1*p9DT3~`RsP-~a(xcZ0dO`-rLB?^)Usxv=J zk1a|O)$=XgvTlpZivEbbWyMK29$9zGMH&q89xPMmz3=r!+hz0=o$&JdfUHS$^y+v) zE=Nvw0T%3jg_Hh)IR(~+>>Kz_%=Qmck=kq0lzryG-*Bj=`alM z%k)2*ia>87^VZx6z>?L-?k{#CKI;|e*$4a~=A+3g{4iN zC>5@kj4j$sw6r9$`L*XfYgmp0X&VJi?-VO8@|aCk$5som$}k6r_T>W$HG5KK<_imO z(KkWKURaNb`e8`r(0uPllUdy7E(@!U3rWF{1J@1MRwH9v$~1L))C)6mB;G?GScW!e z{0rmBKo~yr`q}FBQKyxyM1^a)LQt>=-@L!4)A$xbxw!+lj$C1~9i3vFUn_Bdvzi!T z{ni=N;Z7esH+Nce1}H*X;OsYl7@&2M0m?cDT#-{_WWr_^SSf26;zPeJp=K@i_Ai*c zr6gb6TMJYF2na{SN_ZO5(42(ZrGro}q+Vso#=F`8!>QlJ4w6kaIn~7k+V=X+U=HlK zos0xUUBsxa2!<1oVyoDqGCJrV44-UzW~{Uhh;8Fr0MjUg?q*KQ@r^MRpC>tq9Vw(l zt!%4omeADa%b@$jpVynWkS6K=IE;su(8zj8WaU>G)Y^aEj(u0nYOO@UO+U_V<~A?H za+IDdh%L}C+#E0ANMTDRvg0~uzIL#(vivtr=b7(hAVZM#94dr?&ohOOpuRFReftJ1KKE)xesc zfIGi`UXOz{a@;tg5zl{T!K8z6B(i^JXtABRg0aTwR`B5CzaLot*^5pZYWv&aaYAm< z#qZ-iN|OCuvCbM1Sokh`E2rsWY)n{bXSN~NXZRU5#oep#QS%g4=}s?0PE)9C#FEteO9Oajt|R{v%d;e91|JSJIHC ztHFEydUtPI898GE4?IoOuFtJDnJ0TKEtw^$r;JJ58x}Pc9Qkl|CD- zZp)Le3%vj$8&fs!`OEZVO#_?~OZV6JU6d8(jokomEB9tUpIMEJiSPm^0;Yl%;^)i? 
zR`V{aCFJ0lm4Tyzl!p^mXlzCk!x>EIS=1ZyMRxBp(`4LW{BhFkL3uJ!c7SD<4z>cu z16#tg0WXMN?oVsq2uC?GV(|x+iKE*6e=5!YaHK)xPvWJp6$a&8@P4!oSF!0!L%~w+ zx6k&og~qhL7C4%ViJtg`0w2DK6DQ9!pV=bI?@@)-rJQ&c^+Y7KCW+DbDF&YJ5%*+s zmx~KJg16`WrFRfPVaoyyI~S(4rmOuJEkgkpx`BrTzK6E(7|ecl=KJef=@WCs+`wcsR+@$~QI`2n$!G8cW5Ei%onR#FgY= zXyP2UD&+0JK3C=hz#xY5)7J6r_g4ovFSGcCe1VBu(*mG9Yvsl`e>k<_ z%l!$mH2y!Sr<#Cum#OD~lNA!Li)tcnu>TjQ_(@BV%u|f+L!>F;JL9{YleO4 z*|WWYSu>&`F@>q^OdOyfWC6@|rShIXaW#0abeY#gyZ0EPz2}(GsMgZG0aWW0%!uK7 zAGQ!9-+mJP?Ah$0gI>8g$Qu!xys%&nfHy`Ge=mnusB*dJj}A`4mRR)ofdlJx1?jt` ziW-QHNG&hmStmlX*$wU9h9;=#kgWDd1T^0N7C-;V)c0DvYt;6 zlumYKLEpF5YoVc`xdxxpkMbDif)*+~HupaS?6;Q_;T`Pm0{rb9H8d)&`K(OIto~7B z^^=x;c?kgTgJ7$=#GD({XlRB;*)|>*3Ru* zZxzMA#YsxKh(i7H8WsZP%_-+iylFRr@|cq^NEQ4{ZB4O{w=}UFzYK{e2ZD`uXqZv+!>aZ=#f)9CUELH(@}{Gsb_*@K zHBqɦXO{QT_f+b6?t`4krgYSVt?4!0y7qjG;i{%-DE(A7OLY32cv?`ycg|Vr0 zP|rO**h(bu#Acz`xSMw)V>l_fOs0i|sy3nBl{!>bKR>PT1|C=vZB{1!SwI-K)o*a6 zpIWHH`}yiEHnV!fi|LXgBd8Jha-Qg$WADFju2(QSTwE37w@*ej1aqoGZPKyB93{Dx z`y-NUr_AdoSdG|>=V=7=_m&5lc5OO=h4@rk~h& zKhK@h?(ElxVN*maR@G zEYElK9~id>Bep!eDtQ9^Od5ei)X^aCa~l6RHU;) zrR@nqwY1V2uWtD#wvXq*4#(G-V0twaxrAQ|DOjYY{rV;4l`i)3d>RsSS*^?ydk}h+ z8T#{{bP<7%f@4wrmzb-YJkX?*G_^Wxwbc}O6LsHBAq20`{+TsF<#TM1BK4^|>D9;J z?i(RFbP0e_OQy}7$T5t*=^qa*@DB1HU&ECmqx=j@*Lrfd?Tb*V?C>Bk+vwzAo+`^B_o&pNZjIk_eHq}hx$4b{N!Qeg0l zXEsobRunlKokU}G0#!LgX%FkWs{&6=0@0Rl$vNWHTQX1)$!wz;i=*|0$^Wc|myW_K){E}P%%i}ztX>M&{#!&`_d<%xi4so^xf9!XRR~6&RJaY; zL`I4*g*k6o_`L(kX9XFxr0o~qZo$}44iyyN_Vo=ND>8lQyzZ0YINPYT6SV5N0mq0^ z5nOEd(dJ$(=viU&=ZWmLYLv(8#hP_C_tOqKM=L$MXhvbeX?%H~k~wuariPt(+Rpv6 z9x+Hjs#70iZF578wv)QdeDo~TI;XuJYMa%)%7}UBR!5M=c-G#fmi+7^x_=;y;+`*^ z;p(6n$C;mJvXE(jet!S)OXt=Lxl2$H6X2J_VN#K}1@y6}yiPs7CPd?m5^DYhAVB@U+ak z8R`_zrggS^#>)idMUCfFoO!FpYK%woBTuUwn~8^@=nZu;M-mV=u?;}!JkioLoO5mE zFx@Lp8_3eDMa0Cf(LzRR%8gkIO-es;7*{{LOiDsH3u)mQIR5%SH7I$Hy^Qi+tn96x znLtXa)(W78bl4L1V&dMdPHgdRU(;J<$6!;KC&lSPh|tC-uq5zB;E88sjPH&q#B^rX z*ypm=YLYFP9Z*NSJ}2kr>EHkot9D|e5=6#dKS5P&@$m5M%@&g(c%?=z-`Y3}Klm$% z@V9+G^_-5;j3yGLkgq{!_701!zh#|eHNRwzHyhx5$le5xApPMet z2W=k{MZOca+iQ9dFW#`P)DkbwyL(u`Bp>@|N3?}wtjs{M>kS#>!WHclM>Hez+az#{?Xp z9sQ#Y?e$VlKsvR3*kfk}H_&HG{nmwI!bbr1k8xm2Mk&#qEWAii>q<|;(wx(? 
zTdSbX)wWXZ*2BW>NMUoW$x}#RbO1lX-BdjpnXX7DEUC#X)efgw-({Z zR9e22Zxti`*Bk!n*KswTtEIC5cZAGDdLG=Pe-mpv2`a0wf`?SzVpZ(UP+ zN|G%B(lU+ne{%BWO~idS*g&pU?u^IH=g*%j=<9!Y8g|aHJoJ*bJ4WQzTnRB2@MB+_ zV8o^iC!J0_le!|}_{=0;OH|7s^$)jar*}rnbHzn0nl7{VCgqlH&zc2KACq-^uLQYy z_u~Ca1*9wSY*}b}`V9)KW{%u1q^F#1dK?**u=hpX%<;k zcp$*af-aFsyUg!`06)wrU2SO^8f=}@#K$rC6-{0pFaGT=TFIVhiR$+zA3DGete&GGWZpQjbquCzns$ivAmB8R~%QuzJok}HiBw-VBP5Z>$}NnF&&Fq$MnLf z#HuWOt+Uv=4cWK+5nGYJA6~P~J)K-0m`I#a*Vn{tnskcZ>&2EZr`FA9bG2vHk6uXO z#*qFK68(}5L5M9{&7HidTJ&6vAAA=V69XE)5KCD6LM7m#`ZfDtS5d@|aelB8{_3Pu zODsY~?W@53f4hKF1M{KaZ%SV;<0P8PG-BJhQw`qP%l&sbunv+<0O&*K7?r8{GZbsL zr2L5p5B?u%R~Z)78m$#U6ckiS8bwI~X=zXq>F!jzo1s%g5e22Bk?xU>8A?UEW2j+3 zdWHcchGy=@<5AB!dY}6|_uocj&z^68Ypr*^?^^3ae`&ayE&pU<{%)v2`630Ep{-@l z4sv9~pFhjK^MP7}-T6K9eu8B`Q;`IbfKs={lFC{U%}-UR!TS1c-fgItOBswtX_c|` zr)!>8Om|G1botN~fgrieZ0t|Qj<0aM`?w{zMS;+fzj1tpe*Mk1q*b@iVx)!E_jAOjnLwdVO^>vcXdXA|9zUFRbsf z=uwKiXUd-2^+!d(RF6}3yd3F4Z0ULYd)9fx@>Es(R*zN zo+5)B?!K1WdyFXVd5fs-jg1!X!JSSL?6#)IDQx!a%_v_*crRv64vAG4yKdzb-c?Y@ z2v2>f^WZOvG)Ut-XAltilUDtQThGsI)fsM%68rRT zlaArwg_sRJyL8Z}1DE;UPZhL9_$gV`{tMmtH0tPMqvhr_+;9rnJJ!Q#pFitdI)@7w ztx-wF!b`ADLxtT~)x;`*2NX`QjXD#aQc1bB!`8d^KtqxBj>79wkUCG9Fk8NOmCc>mY$FKW@DUhC0lJ>Wv8H08=W82T?V?|Hd>~mlAf_^r&M_@ zMbf^Arv~@l#FmIL-jv6kNZ1bRE;_iQO=J92g8q7c|JMzq4s4E~n1qas`KH|2PTK1E zQmMoC!MCE$&78OKzrBEScTOJ(ayv%`LW+_v0d)I2Kx+&s&bOL6aLbwOn~`I+-Dm~x zITrfzE^617Ycu<>Ci3MuAaZ?=qPH4mY;@NTc*bg()kVmGbBcwVKkX=q8~!Le*DHWli@1TCQ# zjGjgK9!v^S-s`@##}Ju&YkM_TpH0)vzHjQH8-Chx&MfqMpw$AuC2QT7Fr^Xkzv~cSl2tTO(=GP71ZHsU9$l>=9}F6| z*T@lzpXvv%$1dX^$blW~(REb;zN2?^Ydo_J!2lqI3Ve||=^Qge&@S)mRYJH4Pv3lR zm(aGNdw5SGos*6ETe{{Xg||ic;%0XQh(o0N_B6bj4G{^Bvw$IUVCjS>*Ud zt&Gw!Gu;Dny}zOdk50O2E(I{za;PCY{v?Ql&6Nb&X2QMx}252N5F4prPWfsJVMMUhD;$G|V+XFVBP z%c|=q%t5!l^uC`FNyA&atSn!eG5NByfS9NC?~UC=(${qGD#g-uRP7YMV;63ZUv>S%g8C<|yk z^=Y$87Yk^}TFb)2$Cn4W~GVZ4^EF*neRx9>}_g z7;Y-vEC&40Ae+qZr;=Pc=NUB4&(^-p)Mvz3lzzBzK;SjE=e#k|rKeV{MRuYp01QZ$ z0Rpfg6$bEJN5VZOr34Oursm==jER5K@}lkCk||~^K3k47HkTf%=&>@z3?y@Jfaa0Q zJTAD0t;{VXHzd=pU;ZWB{$sr$1Y5`vtr5pbPJmx~Gnm_}^AsrGkuA0%yIJ(`8c8~F z{O9dL=IQ)L!J9o*_xh924KNu|1}P~i53j>kMB6|tDjU#A&X1SAIXdYTZnW*8{5`@4 zWSZ7jX;DBz%7GHFix=Ral54*rZCA+a>?~fPR43PrbM1=jhS>nwXte#wos#6cpZ}hl z0*D=QE?s+zI>#tPqu)fzs|BDHN4`(mH<%wLXCQoL*Qm}+J&*twf62>P@2`OKA5l=U z4)^ZejWG;RTo(0_DgJN{yl`^~tTh~7=DYpiBq(GM(Ul(XRd#OxJwd`W_V=V1gaF6f zzWn|V^%G^qP4gFR+J-r(+h2l+UbELtG6rz&!SrySv&Z{F3aL5(+Adr5;CQ1i3bxW9 zE?5!G%DW(YRJazU+?Z^;(c`n|y1(8KUpyww@O{dFb{FTOk5rE<$gB%bh!ek?D#*O^ zzD4f_#yqD;Nc`_;@*{Rn-#zcPLV9BxWt^AkI$3lHZ~NwzsT4#r&-T2btN_5_au^lqk9SE{h>3{eV#aS+HOW<-*q=R@ z81ru@h&8pas!Y$!P%Ji4^5ZdAy*aOKk~c_%m1kw9OuzOMdN{ z;OC@YP*(>vk!4Ngbu1WHp#=U8SPrrdnlnG%-#@fop8E`8xa~(10nzr%k$aPfIPZGD zlBAJvZe}I9RRf`u)Y}?;T@hV{o@9sY&_w~6Xm-Be1OD+kzv}z<>Eb%6huZQu%cUi< zYTC);6QX}FR0ma@6XkylwVNzs9LoVR0l8>f{I(C$>~{6U%mK_2w{(*2k|M@yJ(u;o zP#>2mhYb@CfLIm_hyp0S>fVxz=ld*S?7d#Uw|9UPD_=OerW*DptvF2t#F>DZx^1li zu0uP{k1GNc+bZcn4h~LM%?$5P82-;I&&yXuj$I+!wc1P%XZnH%Doy3BHL43PcrE*R z9Rjpm`_|e`o{e;%!R?aQ@=N8pe0;}PTF7;?eu+Z!6My|8LDd-W*IqNvMHHL-<0Cwe zOixtS+6rIm+G=A#YgoZ^bij~)nru(9klALO;YM%AHjtU0b-~iYvkVfh7(IwR%;~i} zzU3NltZwAL;J`{}q?6+(Vl18@p#6&G*kBm2!46tOS$L8&0Rd-P~#<(Hny z!j-n1>W#LrGavJTSyUt)br?w%M1gR$bR?^XCPCay7p^b;SI;B$#&xKdRd^| zNnDo7<3En-+qRYTAYU42s1-X(f>g#ghd9KMGxho?C=?YLHA>U@_$O-!1Yxw8-j|i9MJtzPa{)q^vf6^82YHscXq!FQ@yt; zAjr7i5ko1_0X4=dbzjO)dI+#F&n0z!D=h-{ppov<;zFMBAa-M*rh(nB``uEv$Aln# z zCP~C6gdJkEhZD3mASa$Au6&jl+Fp(_ix1FQkG}x)o%)bzNB(Apz#lvf!`|JE-J(mEy(B*ngpE%_2lI^31!b1T zG3ANV^(M!co(o-$Hqm!s2O@NwOjpDq06HIqV3puNmt8v$R?H4c>U)?(AlT`d%+>9f 
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip + + + ShareGPT4Video (Video) + ✅ + ✅ + + git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video + BurstGPT @@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -246,7 +254,7 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -612,7 +620,7 @@ vllm bench serve \ --prefix-repetition-prefix-len 512 \ --prefix-repetition-suffix-len 128 \ --prefix-repetition-num-prefixes 5 \ - --prefix-repetition-output-len 128 + --prefix-repetition-output-len 128 ``` @@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \ --endpoint /v1/chat/completion ``` +### Videos (ShareGPT4Video) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"video": 1}' \ + --allowed-local-media-path /path/to/sharegpt4video/videos +``` + +Send requests with videos: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completion +``` + diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index c62934ed94..e1a856026c 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ) +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and "bytes" in video: + video_bytes = video["bytes"] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + } + + if isinstance(video, str): + video_url = ( + video if video.startswith(("http://", "file://")) else f"file://{video}" + ) + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue - # TODO: Also support ShareGPT4Video. 
if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 3532a083fb..f4fbfad2d1 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - Supports three input types: + Supports the following input types: 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key containing raw image data. - Loads the bytes as a PIL.Image.Image. @@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]: " or str or dictionary with raw image bytes.") +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and 'bytes' in video: + video_bytes = video['bytes'] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + } + + if isinstance(video, str): + video_url = (video if video.startswith( + ("http://", "file://")) else f"file://{video}") + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue - # TODO: Also support ShareGPT4Video. if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: From c32e6ad1f63631fd8033f0cca3a35d5e48ccfc7f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 19 Aug 2025 20:39:28 -0400 Subject: [PATCH 410/932] [Quantization] Bump Compressed Tensors Version (#23202) Signed-off-by: Kyle Sayers Co-authored-by: Dipika Sikka Co-authored-by: Michael Goin --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 3c3ac0abf5..365457436f 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -39,7 +39,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.10.2 # required for compressed-tensors +compressed-tensors == 0.11.0 # required for compressed-tensors depyf==0.19.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From 0167efe20d3d2280c3da6aea94a6f59afec5099c Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 20 Aug 2025 09:25:59 +0800 Subject: [PATCH 411/932] [Core] Optimize scheduler request removal for single completions (#21917) Signed-off-by: chiliu Signed-off-by: chiliu Co-authored-by: chiliu --- vllm/v1/core/sched/scheduler.py | 14 ++++++-------- vllm/v1/core/sched/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b3defa4431..f9a7e21014 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -25,7 +25,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.core.sched.request_queue import (SchedulingPolicy, create_request_queue) -from vllm.v1.core.sched.utils import check_stop +from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.kv_cache_interface import KVCacheConfig @@ -872,9 +872,7 @@ class Scheduler(SchedulerInterface): # Remove the stopped requests from the running and waiting queues. if stopped_running_reqs: - self.running = [ - req for req in self.running if req not in stopped_running_reqs - ] + self.running = remove_all(self.running, stopped_running_reqs) if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) @@ -1000,7 +998,7 @@ class Scheduler(SchedulerInterface): else: request_ids = set(request_ids) - running_requests_to_remove = [] + running_requests_to_remove = set() waiting_requests_to_remove = [] valid_requests = [] @@ -1013,13 +1011,13 @@ class Scheduler(SchedulerInterface): valid_requests.append(request) if request.status == RequestStatus.RUNNING: - running_requests_to_remove.append(request) + running_requests_to_remove.add(request) else: waiting_requests_to_remove.append(request) # Remove all requests from queues at once for better efficiency - for request in running_requests_to_remove: - self.running.remove(request) + if running_requests_to_remove: + self.running = remove_all(self.running, running_requests_to_remove) if waiting_requests_to_remove: self.waiting.remove_requests(waiting_requests_to_remove) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42ec95091f..42d3e5c68b 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib from typing import Optional import torch @@ -7,6 +8,38 @@ import torch from vllm.v1.request import Request, RequestStatus +def remove_all(lst: list, items_to_remove: set) -> list: + """Remove all items from a list that are in the items_to_remove set. + + This method optimizes for the common case of removing a single item, + falling back to list comprehension for multiple items. 
+ + Args: + lst: The list to remove items from + items_to_remove: Set of items to remove + + Returns: + Either the modified original list (for single item removal) or + a new list (for multiple item removal). Callers should use the + returned value. + + Note: + For single item removal, this modifies the original list in-place + and returns it. For multiple items, it creates and returns a new list. + """ + if not items_to_remove: + return lst + + if len(items_to_remove) == 1: + # Fast path for single item removal (most common case) + item = next(iter(items_to_remove)) + with contextlib.suppress(ValueError): + lst.remove(item) + return lst + # For multiple items, use list comprehension + return [item for item in lst if item not in items_to_remove] + + def check_stop(request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None) -> bool: From d46d417b5897d7eddb002b61b19e8cba029c3dda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 19 Aug 2025 22:18:52 -0400 Subject: [PATCH 412/932] [CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce.py (#23132) Signed-off-by: mgoin --- tests/compile/test_fusion_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 4c3cf6c2a1..dd31e0db1f 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( From e58c5a97688750e7930f13b6fe556d9a28a5b2d9 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Tue, 19 Aug 2025 19:32:47 -0700 Subject: [PATCH 413/932] [Core] Add torch profiler CPU traces for AsyncLLM. (#21794) Signed-off-by: Chenheli Hua --- vllm/envs.py | 6 ++++-- vllm/v1/engine/async_llm.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 861e4c6a1b..70068cca66 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -667,8 +667,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - # Enables torch profiler if set. Path to the directory where torch profiler - # traces are saved. Note that it must be an absolute path. + # Enables torch profiler if set. + # Both AsyncLLM's CPU traces as well as workers' + # traces (CPU & GPU) will be saved under this directory. + # Note that it must be an absolute path. 
"VLLM_TORCH_PROFILER_DIR": lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 664fec31a4..342d7b24f8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import os +import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union import numpy as np +import torch import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig @@ -144,6 +147,26 @@ class AsyncLLM(EngineClient): except RuntimeError: pass + if envs.VLLM_TORCH_PROFILER_DIR: + logger.info( + "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 + envs.VLLM_TORCH_PROFILER_DIR) + worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + ], + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + envs.VLLM_TORCH_PROFILER_DIR, + worker_name=worker_name, + use_gzip=True)) + else: + logger.info( + "Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501 + ) + self.profiler = None + @classmethod @deprecate_kwargs( "disable_log_requests", @@ -562,10 +585,16 @@ class AsyncLLM(EngineClient): raise self.dead_error async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + coros = [self.engine_core.profile_async(True)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.start)) + await asyncio.gather(*coros) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + coros = [self.engine_core.profile_async(False)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.stop)) + await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: self.processor.mm_registry.reset_processor_cache(self.model_config) From 64ab3c7253afb8cc2008777153812109bf92d7c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 10:33:41 +0800 Subject: [PATCH 414/932] [Doc] Update V1 status of various pooling models (#23189) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 26 ++++++++++---------- tests/models/language/pooling/test_gritlm.py | 9 ++++--- vllm/model_executor/models/gritlm.py | 6 ++--- vllm/model_executor/models/interfaces.py | 11 ++++++--- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1d165fa6f1..7908e42387 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -363,7 +363,7 @@ th { | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. 
| ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | @@ -436,17 +436,17 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) @@ -476,7 +476,7 @@ These models primarily support the [`LLM.classify`](./pooling_models.md#llmclass | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | @@ -493,12 +493,12 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index d21987571c..17a55d916b 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -14,6 +14,7 @@ from ....utils import RemoteOpenAIServer MODEL_NAME = "parasail-ai/GritLM-7B-vllm" MAX_MODEL_LEN = 4000 +ATOL = 0.002 def _arr(arr): @@ -97,16 +98,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL) def test_gritlm_offline_embedding(vllm_runner): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 9e7490e3c4..3f6790269a 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import SupportsV0Only +from .interfaces import default_pooling_type logger = init_logger(__name__) @@ -215,7 +215,8 @@ class 
GritLMPooler(Pooler): return build_output(pooled_data) -class GritLM(LlamaForCausalLM, SupportsV0Only): +@default_pooling_type("MEAN") +class GritLM(LlamaForCausalLM): """This class implements the embedding model for parasail-ai/GritLM-7B-vllm. The class inherits from LlamaForCausalLM and provides a custom pooling @@ -241,7 +242,6 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): prefix: str = "", **kwargs, ) -> None: - # Use full attention for pooling (this is why V1 is not supported yet) if vllm_config.model_config.runner_type == "pooling": hf_config = vllm_config.model_config.hf_config hf_config.is_causal = False diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c425488f83..9415e67924 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) + TypeVar, Union, overload, runtime_checkable) import numpy as np import torch @@ -641,11 +641,14 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -def default_pooling_type(pooling_type: str) -> object: +_T = TypeVar("_T", bound=type[torch.nn.Module]) + + +def default_pooling_type(pooling_type: str): """Set default_pooling_type decorator. """ - def func(model: object): - model.default_pooling_type = pooling_type + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore return model return func From a634733f67b39fd9c1da1a861ba39f75efb576f3 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Tue, 19 Aug 2025 22:57:47 -0400 Subject: [PATCH 415/932] [Attention] Optimize make_local_attention_virtual_batches for Flash Attention (#23185) Signed-off-by: linzebing --- vllm/v1/attention/backends/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 5e6bc33183..94dd3d2629 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -464,8 +464,9 @@ def make_local_attention_virtual_batches( attn_chunk_size)[arange > 0] # convert from q_seqlens to cu_seqlens_q - cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0))\ - .astype(np.int32) + cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) + np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) + cu_seqlens_q_local[0] = 0 # compute the seqlens_k_local, # basically a full local attention block for all but the last block in each @@ -508,11 +509,10 @@ def make_local_attention_virtual_batches( # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) # ] - block_indices= np.broadcast_to( - np.arange(pages_per_local_batch, dtype=np.int32), - (virtual_batches, pages_per_local_batch)) \ - + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) + block_indices = (block_starts[:, None] + + np.arange(pages_per_local_batch, dtype=np.int32)) + block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - + 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) block_table_local = block_table[batch_indices, block_indices]\ From 941f56858a48e097391cfcc451c3f6d88f7cf20c Mon Sep 17 00:00:00 2001 From: Louie 
Tsai Date: Tue, 19 Aug 2025 20:14:32 -0700 Subject: [PATCH 416/932] Fix a performance comparison issue in Benchmark Suite (#23047) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- .../scripts/compare-json-results.py | 144 ++++++++++++++---- 1 file changed, 118 insertions(+), 26 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 12c4ba6aa6..50431d0cd4 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -3,44 +3,129 @@ import argparse import json import os +from importlib import util import pandas as pd +plotly_found = util.find_spec("plotly.express") is not None + def compare_data_columns( files, name_column, data_column, info_cols, drop_column, debug=False ): - print("\ncompare_data_column: " + data_column) + """ + Align concatenation by keys derived from info_cols instead of row order. + - Pick one canonical key list: subset of info_cols present in ALL files. + - For each file: set index to those keys, aggregate duplicates + - (mean for metric, first for names). + - Concat along axis=1 (indexes align), then reset_index so callers can + - group by columns. + - If --debug, add a _name column per file. + """ + print("\ncompare_data_column:", data_column) + frames = [] raw_data_cols = [] compare_frames = [] + + # 1) choose a canonical key list from info_cols that exists in ALL files + cols_per_file = [] + for f in files: + try: + df_tmp = pd.read_json(f, orient="records") + except Exception as err: + raise ValueError(f"Failed to read {f}") from err + cols_per_file.append(set(df_tmp.columns)) + + key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] + if not key_cols: + # soft fallback: use any info_cols present in the first file + key_cols = [c for c in info_cols if c in list(cols_per_file[0])] + if not key_cols: + raise ValueError( + "No common key columns found from info_cols across the input files." 
+ ) + + # 2) build a single "meta" block (keys as columns) once, aligned by the key index + meta_added = False + for file in files: - data_df = pd.read_json(file) - serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - # Show all info columns in the first couple columns - if not frames: - for col in info_cols: - if col not in serving_df.columns: - print(f"Skipping missing column: {col}") - continue - frames.append(serving_df[col]) - # only show test name under debug mode - if debug is True: - serving_df = serving_df.rename(columns={name_column: file + "_name"}) - frames.append(serving_df[file + "_name"]) + df = pd.read_json(file, orient="records") - file = "/".join(file.split("/")[:-1]) - serving_df = serving_df.rename(columns={data_column: file}) - frames.append(serving_df[file]) - raw_data_cols.append(file) - compare_frames.append(serving_df[file]) + # Keep rows that actually have the compared metric (same as original behavior) + if drop_column in df.columns: + df = df.dropna(subset=[drop_column], ignore_index=True) + + # Stabilize numeric key columns (harmless if missing) + for c in ( + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ): + if c in df.columns: + df[c] = pd.to_numeric(df[c], errors="coerce") + + # Ensure all key columns exist + for c in key_cols: + if c not in df.columns: + df[c] = pd.NA + + # Set index = key_cols and aggregate duplicates → unique MultiIndex + df_idx = df.set_index(key_cols, drop=False) + + # meta (key columns), unique per key + meta = df_idx[key_cols] + if not meta.index.is_unique: + meta = meta.groupby(level=key_cols, dropna=False).first() + + # metric series for this file, aggregated to one row per key + file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + s.name = file_label # column label like original + + # add meta once (from first file) so keys are the leftmost columns + if not meta_added: + frames.append(meta) + meta_added = True + + # (NEW) debug: aligned test-name column per file + if debug and name_column in df_idx.columns: + name_s = df_idx[name_column] + if not name_s.index.is_unique: + name_s = name_s.groupby(level=key_cols, dropna=False).first() + name_s.name = f"{file_label}_name" + frames.append(name_s) + + frames.append(s) + raw_data_cols.append(file_label) + compare_frames.append(s) + + # Generalize ratio: for any file N>=2, add ratio (fileN / file1) if len(compare_frames) >= 2: - # Compare numbers among two files - ratio_df = compare_frames[1] / compare_frames[0] - frames.append(ratio_df) - compare_frames.pop(1) + base = compare_frames[0] + current = compare_frames[-1] + ratio = current / base + ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio.name = f"Ratio 1 vs {len(compare_frames)}" + frames.append(ratio) + # 4) concat on columns with aligned MultiIndex; + # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) + concat_df = concat_df.reset_index(drop=True).reset_index() + if "index" in concat_df.columns: + concat_df = concat_df.drop(columns=["index"]) + + # Ensure key/info columns appear first (in your info_cols order) + front = [c for c in info_cols if c in concat_df.columns] + rest = [c for c in concat_df.columns if c not in front] + concat_df = concat_df[front + rest] + print(raw_data_cols) return concat_df, raw_data_cols @@ -67,6 +152,15 @@ def split_json_by_tp_pp( df = pd.DataFrame(data) + # Keep 
only "serving" tests + name_col = next( + (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None + ) + if name_col: + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() + # Handle alias column names rename_map = { "tp_size": "TP Size", @@ -181,7 +275,6 @@ if __name__ == "__main__": f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: @@ -189,8 +282,7 @@ if __name__ == "__main__": text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - if plot is True: - import pandas as pd + if plot and plotly_found: import plotly.express as px df = group[raw_data_cols] From 1a3079a15e5c8ae2790a1897f82e5af0d68a6921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B8=B8=EC=9E=AC=EC=9D=80?= Date: Wed, 20 Aug 2025 13:02:50 +0900 Subject: [PATCH 417/932] chore: support pytorch format in lora (#22790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jaeeun.kil Signed-off-by: 길재은 --- vllm/lora/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e6b19d4748..3072047a26 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -207,6 +207,7 @@ class LoRAModel(AdapterModel): """ lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, @@ -255,9 +256,10 @@ class LoRAModel(AdapterModel): check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) - elif os.path.isfile(lora_bin_file_path): - # When a bin file is provided, we rely on config to find unexpected - # modules. + elif os.path.isfile(lora_bin_file_path) or os.path.isfile( + lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. unexpected_modules = [] target_modules = peft_helper.target_modules if not isinstance(target_modules, list): @@ -279,7 +281,10 @@ class LoRAModel(AdapterModel): f" target modules in {expected_lora_modules}" f" but received {unexpected_modules}." 
f" Please verify that the loaded LoRA module is correct") - tensors = torch.load(lora_bin_file_path, + lora_file_path = (lora_bin_file_path + if os.path.isfile(lora_bin_file_path) else + lora_pt_file_path) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) else: From f72902327246bc68ff0d196a89cc81262f46de1b Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Tue, 19 Aug 2025 21:09:27 -0700 Subject: [PATCH 418/932] [CI/Build] Also check DP in benchmarks throughput script (#23038) Co-authored-by: Simon Mo --- benchmarks/benchmark_throughput.py | 4 ++-- vllm/benchmarks/throughput.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c51b579686..c7f290e1eb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -597,8 +597,8 @@ def validate_args(args): # https://github.com/vllm-project/vllm/issues/16222 if args.data_parallel_size > 1: raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" ) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0c19fa6dcf..f022a55e62 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -434,6 +434,14 @@ def validate_args(args): if args.backend == "mii" and args.tokenizer != args.model: raise ValueError( "Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" + ) def add_cli_args(parser: argparse.ArgumentParser): From de7b67a0232e35ae8e8ecd944aeddfc8cbc02631 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 13:06:42 +0800 Subject: [PATCH 419/932] [CI/Build] Sync multimodal tests (#23181) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 10 +++++--- tests/models/registry.py | 24 +++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0fdc182b9e..8aa0dc7e8e 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -275,16 +275,17 @@ def _test_processing_correctness_one( "google/gemma-3n-E2B-it", "zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", + "zai-org/GLM-4.5V", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + "HuggingFaceM4/Idefics3-8B-Llama3", "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", - "HuggingFaceM4/Idefics3-8B-Llama3", - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -315,10 +316,13 @@ def _test_processing_correctness_one( "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "stepfun-ai/step3", "fixie-ai/ultravox-v0_5-llama-3_2-1b", 
"openai/whisper-large-v3", "omni-research/Tarsier-7b", "omni-research/Tarsier2-Recap-7b", + "mistralai/Voxtral-Mini-3B-2507", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index cbdc9edbbc..28fe906316 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,9 +215,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", trust_remote_code=True, is_available_online=False), - "HCXVisionForCausalLM": _HfExamplesInfo( - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", - trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", @@ -298,8 +295,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -405,22 +401,24 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", - is_available_online=False), # noqa: E501 + min_transformers_version="4.56"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", # noqa: E501 + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), - "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", - trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -464,9 +462,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 - "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True, - max_transformers_version="4.53", - transformers_version_reason="HF model is not compatible"), # noqa: E501 + "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", + trust_remote_code=True, + 
max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible"), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", @@ -496,8 +495,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 From 8fd920924c8c13fb757c324f9e73c70d2d5f3029 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 19 Aug 2025 22:50:29 -0700 Subject: [PATCH 420/932] [BugFix] Fix stuck stats/metrics after requests are aborted (#22995) Signed-off-by: Nick Hill --- tests/entrypoints/openai/test_metrics.py | 95 +++++++++++++++++++++++- vllm/v1/core/block_pool.py | 7 +- vllm/v1/core/sched/scheduler.py | 9 ++- 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9107d08983..ff2e7004ff 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import asyncio import subprocess import sys import tempfile @@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert metric in response.text +@pytest.mark.asyncio +async def test_abort_metrics_reset(server: RemoteOpenAIServer, + client: openai.AsyncClient, use_v1: bool): + + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect no running requests or kvcache usage + assert running_requests == 0 + assert waiting_requests == 0 + assert kv_cache_usage == 0.0 + + # Start some long-running requests that we can abort + tasks = [] + for _ in range(3): + task = asyncio.create_task( + client.completions.create( + model=MODEL_NAME, + prompt=_TOKENIZED_PROMPT, + max_tokens=100, # Long generation to give time to abort + temperature=0.0)) + tasks.append(task) + + # Wait a bit for requests to start processing + await asyncio.sleep(0.5) + + # Check that we have running requests + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect running requests and kvcache usage + assert running_requests > 0 + assert kv_cache_usage > 0 + + # Cancel all tasks to abort the requests + for task in tasks: + task.cancel() + + # Wait for cancellations to be processed + await asyncio.sleep(1.0) + + # Check that metrics have reset to zero + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests_after, waiting_requests_after, kv_cache_usage_after = ( + _get_running_metrics_from_api(server)) + + assert running_requests_after == 0,\ + (f"Expected 0 running requests after abort, got " + f"{running_requests_after}") + assert waiting_requests_after == 0,\ + (f"Expected 0 waiting requests after abort, got " + 
f"{waiting_requests_after}") + assert kv_cache_usage_after == 0,\ + (f"Expected 0% KV cache usage after abort, got " + f"{kv_cache_usage_after}") + + +def _get_running_metrics_from_api(server: RemoteOpenAIServer): + """Return (running_count, waiting_count, kv_cache_usage)""" + + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests, waiting_requests, kv_cache_usage = None, None, None + + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:num_requests_running": + for sample in family.samples: + if sample.name == "vllm:num_requests_running": + running_requests = sample.value + break + elif family.name == "vllm:num_requests_waiting": + for sample in family.samples: + if sample.name == "vllm:num_requests_waiting": + waiting_requests = sample.value + break + elif family.name == "vllm:gpu_cache_usage_perc": + for sample in family.samples: + if sample.name == "vllm:gpu_cache_usage_perc": + kv_cache_usage = sample.value + break + + assert running_requests is not None + assert waiting_requests is not None + assert kv_cache_usage is not None + + return running_requests, waiting_requests, kv_cache_usage + + def test_metrics_exist_run_batch(use_v1: bool): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 839297135f..fdd96c3e95 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -298,7 +298,12 @@ class BlockPool: Returns: The KV cache usage (between 0.0 and 1.0). """ - return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks) + + # Subtract 1 to account for null block. + total_gpu_blocks = self.num_gpu_blocks - 1 + if not total_gpu_blocks: + return 0 + return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks) def take_events(self) -> list[KVCacheEvent]: """Atomically takes all events and clears the queue. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f9a7e21014..4b167da5c8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -902,10 +902,13 @@ class Scheduler(SchedulerInterface): finished_requests=finished_set) finished_req_ids.clear() - if engine_core_outputs: + if (stats := self.make_stats(spec_decoding_stats)) is not None: # Return stats to only one of the front-ends. - next(iter(engine_core_outputs.values())).scheduler_stats = ( - self.make_stats(spec_decoding_stats)) + if (eco := next(iter(engine_core_outputs.values()), None)) is None: + # We must return the stats even if there are no request + # outputs this step. 
+ engine_core_outputs[0] = eco = EngineCoreOutputs() + eco.scheduler_stats = stats return engine_core_outputs From d983769c41db224e0897fac2e9aefc5f57ad1122 Mon Sep 17 00:00:00 2001 From: who who who Date: Wed, 20 Aug 2025 14:24:37 +0800 Subject: [PATCH 421/932] fix cuda graph (#22721) Signed-off-by: fsx950223 --- vllm/v1/attention/backends/rocm_aiter_fa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 7d09ac0a4a..36b5853bfd 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch @@ -11,7 +11,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -231,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + cudagraph_support = AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From 103f1ec8d348a5f336f11d972d6285c4fb4736d4 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Wed, 20 Aug 2025 18:16:27 +0800 Subject: [PATCH 422/932] [Model] use autoWeightsLoader for gptoss (#22446) Signed-off-by: calvin chen --- vllm/model_executor/models/gpt_oss.py | 724 +++++++++++++------------- 1 file changed, 370 insertions(+), 354 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2f5d9ddd90..cd93f0ef1e 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .utils import extract_layer_index, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, + maybe_prefix) class OAIAttention(nn.Module): @@ -203,6 +204,7 @@ class GptOssModel(nn.Module): super().__init__() self.config = vllm_config.model_config.hf_config self.quant_config = vllm_config.quant_config + self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size self.embedding = VocabParallelEmbedding( self.config.vocab_size, @@ -225,8 +227,364 @@ class GptOssModel(nn.Module): x = self.norm(x) return x + def _load_weights_mxfp4( + self, + ep_rank_end: int, + ep_rank_start: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + mxfp4_block = 32 + use_ep = self.parallel_config.enable_expert_parallel + num_experts = self.config.num_local_experts + + tp_rank = get_tensor_model_parallel_rank() + tp_size = 
get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + # FIXME(woosuk): Remove this after testing. + weight = weight.cuda() + + if ".w13_weight_scale" in name: + # Handle MLP gate and up projection weights scale + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight_scale" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_weight" in name: + # Handle MLP gate and up projection weights + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def _load_weights_other( + self, + ep_rank_start: int, + ep_rank_end: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + use_ep = self.parallel_config.enable_expert_parallel + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + if ".w13_weight" in name: + # Handle MLP gate and up projection weights + # Extract gate and up projection parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[name] + param.copy_(weight) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv", ".q_proj", "q"), + (".qkv", ".k_proj", "k"), + (".qkv", ".v_proj", "v"), + ] + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + # Attention heads per rank + heads_per_rank = self.config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + quant_method = (self.config.quantization_config['quant_method'] if + hasattr(self.config, "quantization_config") else None) + if quant_method == "mxfp4": + return self._load_weights_mxfp4(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + else: + return self._load_weights_other(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + class GptOssForCausalLM(nn.Module): + packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".self_attn.": ".attn.", + ".post_attention_layernorm.": ".mlp.norm.", + }, + orig_to_new_suffix={ + ".embed_tokens.weight": ".embedding.weight", + ".input_layernorm.weight": ".attn.norm.weight", + ".post_attention_layernorm.weight": ".mlp.norm.weight", + + # MoE MXFP4 weights + ".gate_up_proj_blocks": ".w13_weight", + ".down_proj_blocks": ".w2_weight", + ".gate_up_proj_scales": ".w13_weight_scale", + ".down_proj_scales": ".w2_weight_scale", + + # MoE other weights + ".gate_up_proj": ".w13_weight", + ".down_proj": ".w2_weight", + + # MoE Bias + ".gate_up_proj_bias": ".w13_bias", + ".down_proj_bias": ".w2_bias", + }, + ) def __init__( self, @@ -235,16 +593,17 @@ class GptOssForCausalLM(nn.Module): ): super().__init__() self.vllm_config = vllm_config - self.model_config = vllm_config.model_config.hf_config + self.config = 
vllm_config.model_config.hf_config + self.model = GptOssModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"), ) self.lm_head = ParallelLMHead( - self.model_config.vocab_size, - self.model_config.hidden_size, + self.config.vocab_size, + self.config.hidden_size, ) - self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + self.logits_processor = LogitsProcessor(self.config.vocab_size) def forward(self, input_ids: torch.Tensor, @@ -261,354 +620,11 @@ class GptOssForCausalLM(nn.Module): sampling_metadata) return logits - def _load_weights_mxfp4( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - mxfp4_block = 32 - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - intermediate_size_block = intermediate_size // mxfp4_block - per_rank_intermediate_size_block = cdiv(intermediate_size_block, - tp_size) - per_rank_intermediate_size = (per_rank_intermediate_size_block * - mxfp4_block) - - # Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - # FIXME(woosuk): Remove this after testing. - weight = weight.cuda() - - if "gate_up_proj_blocks" in name: - # Handle MLP gate and up projection weights - new_name = name.replace("gate_up_proj_blocks", "w13_weight") - - # flat weight from (E, 2 * N, block_size, entry_per_block) - # to (E, 2 * N, -1), shouldn't trigger copy for contiguous - weight = weight.view(num_experts, 2 * intermediate_size, - -1).contiguous() - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_blocks" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_blocks", "w2_weight") - # same flatten here, but since 2 mx4 value are packed in 1 - # uint8, divide by 2 - weight = weight.view(num_experts, -1, - intermediate_size // 2).contiguous() - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - narrow_weight = weight[..., - tp_rank_start // 2:tp_rank_end // 2] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "gate_up_proj_scales" in name: - # Handle MLP gate and up projection weights scale - new_name = name.replace("gate_up_proj_scales", - "w13_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_scales" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_scales", "w2_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[..., tp_rank_start // - mxfp4_block:tp_rank_end // - mxfp4_block] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - weight_loader(param, - weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - loaded_params.add(renamed_name) - - return loaded_params - - def _load_weights_other( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - - per_rank_intermediate_size = cdiv(intermediate_size, tp_size) - # Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - if ".experts.gate_up_proj" in name and "bias" not in name: - # Handle MLP gate and up projection weights - new_name = name.replace(".experts.gate_up_proj", - ".experts.w13_weight") - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, :, - 2 * tp_rank_start:2 * tp_rank_end] - - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif ".experts.down_proj" in name and "bias" not in name: - # Handle MLP down projection weights - new_name = name.replace(".experts.down_proj", - ".experts.w2_weight") - - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] - else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - param = params_dict[new_name] - param.copy_(weight) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - loaded_params.add(renamed_name) - - return loaded_params - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - quant_method = (self.model_config.quantization_config['quant_method'] - if hasattr(self.model_config, "quantization_config") - else None) - if quant_method == "mxfp4": - return self._load_weights_mxfp4(weights) - else: - return self._load_weights_other(weights) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 3aa8c100381a1c6a99a259d9da5dac70fd3a6c0b Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 20 Aug 2025 18:46:59 +0800 Subject: [PATCH 423/932] Fix missing quotes (#23242) Signed-off-by: Shiming Zhang --- docs/deployment/frameworks/dstack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 23dc58c974..fe4d87f78f 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -9,7 +9,7 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```bash -pip install "dstack[all] +pip install dstack[all] dstack server ``` From 83e69a09d6c1a5e88ae00060e79ec7b7a9465462 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Wed, 20 Aug 2025 04:01:31 -0700 Subject: [PATCH 424/932] [Model] Support deepseek with eagle (#21086) Signed-off-by: Xin 
Yang --- tests/models/registry.py | 3 + tests/v1/e2e/test_spec_decode.py | 6 +- vllm/model_executor/models/deepseek_eagle.py | 246 +++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_eagle.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 28fe906316..739d962279 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -530,6 +530,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), + "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random", + speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501 + trust_remote_code=True), "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B", trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7b3f458312..bd0fa6b807 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -144,6 +144,8 @@ def test_ngram_correctness( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), True, marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), ], ids=[ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 @@ -151,7 +153,8 @@ def test_ngram_correctness( "llama3_eagle", "llama3_eagle3", "llama4_eagle", - "llama4_eagle_mm" + "llama4_eagle_mm", + "deepseek_eagle" ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -177,6 +180,7 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py new file mode 100644 index 0000000000..0c9c83cf61 --- /dev/null +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, + DeepseekV3ForCausalLM) +from vllm.model_executor.sampling_metadata import SamplingMetadata + +from .utils import AutoWeightsLoader, maybe_prefix + + +@support_torch_compile +class DeepseekV2Model(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer_id: int = 0, + ) -> None: + super().__init__() + 
self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = self.config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer( + self.config, + prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ) for i in range(self.config.num_hidden_layers) + ]) + + self.fc = nn.Linear( + self.config.model.hidden_size * 2, + self.config.model.hidden_size, + bias=False, + ) + + self.enorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.hnorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + input_embeds = self.embed_tokens(input_ids) + + inputs = torch.cat( + [self.enorm(input_embeds), + self.hnorm(hidden_states)], dim=-1) + hidden_states = self.fc(inputs) + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name_mapped = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + # if go with fusion option, then update name + if ((param_name == "fused_qkv_a_proj") + and name_mapped not in params_dict): + continue + else: + name = name_mapped + + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # if PP disabled then draft will share embed with target + if get_pp_group().world_size == 1 and \ + "embed_tokens." in name: + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + quant_config = vllm_config.quant_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix="model", + start_layer_id=target_layer_num) + + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config) + + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.config.vocab_size, + scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) + return self.model(input_ids, positions, hidden_states) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + ) + + model_weights = {} + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + model_weights[name] = loaded_weight + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8728684d8e..a94231b0f8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -264,6 +264,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), From 68fcd3fa7313d00240f766f42affe931f1f379a7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 19:09:18 +0800 Subject: [PATCH 425/932] [Bugfix] Ensure correctness of Cohere2Vision processing (#23245) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 1 + vllm/model_executor/models/aya_vision.py | 3 +- vllm/model_executor/models/cohere2_vision.py | 71 ++++++++++++++----- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8aa0dc7e8e..d5b1de834a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -268,6 +268,7 @@ def _test_processing_correctness_one( "CohereForAI/aya-vision-8b", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", + "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "microsoft/Florence-2-base", "adept/fuyu-8b", diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b02a973d94..687c82ded9 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -250,8 +250,7 @@ class AyaVisionMultiModalProcessor( image_processor = hf_processor.image_processor def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) num_patches = self.info.get_num_patches( image_width=image_size.width, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index bc526fd661..4682a8a428 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -10,6 +10,8 @@ import torch from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import ( # noqa: E501 + get_optimal_tiled_canvas) from transformers.models.cohere2_vision.processing_cohere2_vision import ( Cohere2VisionProcessor) @@ -150,14 +152,46 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): max_patches = image_processor.max_patches return ImageSize(height=height * max_patches, width=width) - def get_num_patches(self, image_width: int, image_height: int) -> int: + def get_num_patches( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Cohere2VisionProcessor], + ) -> int: """ Calculate the number of image patches for a given image. 
Uses the HF processor to determine the actual number of patches. """ - return self.get_hf_processor( - ).image_processor.get_number_of_image_patches(image_height, - image_width, {}) + if processor is None: + processor = self.get_hf_processor() + + image_processor = processor.image_processor + + # The current implementation of get_number_of_image_patches + # is incorrect, so we patch it here. + # return image_processor.get_number_of_image_patches(image_height, + # image_width, {}) + + min_patches = image_processor.min_patches + max_patches = image_processor.max_patches + patch_size = image_processor.size + crop_to_patches = image_processor.crop_to_patches + + if not crop_to_patches: + return 1 + + num_columns, num_rows = get_optimal_tiled_canvas( + (image_height, image_width), + (patch_size["height"], patch_size["width"]), + min_patches, + max_patches, + ) + num_patches = num_columns * num_rows + if num_patches > 1: + num_patches += 1 # Thumbnail image + + return num_patches class Cohere2VisionDummyInputsBuilder( @@ -208,6 +242,8 @@ class Cohere2VisionMultiModalProcessor( # Ensure num_patches is available for proper tensor splitting if "num_patches" not in processed_outputs and ( images := mm_data.get("images")) is not None: + hf_processor = self.info.get_hf_processor(**mm_kwargs) + # Fallback calculation if HF processor didn't provide num_patches parsed_images = self._get_data_parser().parse_mm_data({ "image": @@ -217,8 +253,9 @@ class Cohere2VisionMultiModalProcessor( num_patches = [ self.info.get_num_patches( image_width=parsed_images.get_image_size(i).width, - image_height=parsed_images.get_image_size(i).height) - for i in range(len(parsed_images)) + image_height=parsed_images.get_image_size(i).height, + processor=hf_processor, + ) for i in range(len(parsed_images)) ] processed_outputs["num_patches"] = torch.tensor(num_patches) @@ -245,25 +282,25 @@ class Cohere2VisionMultiModalProcessor( ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token + img_tokens_per_tile = int(hf_processor.patch_size**2) img_line_break_token = hf_processor.img_line_break_token boi_token = hf_processor.boi_token eoi_token = hf_processor.eoi_token def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) - num_patches = self.info.get_num_patches(image_size.height, - image_size.width) - img_tokens_per_tile = int(hf_processor.patch_size**2) - single_tile_tokens = image_token * img_tokens_per_tile + \ - img_line_break_token - img_string = f"{boi_token}\ - {single_tile_tokens * num_patches}\ - {eoi_token}" + num_patches = self.info.get_num_patches( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + patch_tokens = (image_token * img_tokens_per_tile + + img_line_break_token) + repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}" - return PromptUpdateDetails.select_text(img_string, image_token) + return PromptUpdateDetails.select_text(repl, image_token) return [ PromptReplacement( From 50df09fe13c93b520c64c581de4f0b469995f7b9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 08:05:54 -0400 Subject: [PATCH 426/932] Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 2 +- docker/Dockerfile | 52 
++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 85d3e56387..e20ce54ca7 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -68,7 +68,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Annotate release workflow" diff --git a/docker/Dockerfile b/docker/Dockerfile index 7493891778..cfaa598682 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -372,31 +372,45 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.11" +# Keep this in sync with "flashinfer" extra in setup.py +ARG FLASHINFER_GIT_REF="v0.2.12" +# Flag to control whether to compile FlashInfer AOT kernels +# Set to "true" to enable AOT compilation: +# docker build --build-arg FLASHINFER_AOT_COMPILE=true ... +ARG FLASHINFER_AOT_COMPILE=false RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ ${FLASHINFER_GIT_REPO} flashinfer - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. - if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" - fi - echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" - # Needed to build AOT kernels pushd flashinfer - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation --force-reinstall --no-deps . + if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. 
+ if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi + echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + # Build AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot + # Install with no-build-isolation since we already built AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + # Download pre-compiled cubins + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + else + echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" + uv pip install --system . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + fi popd rm -rf flashinfer BASH diff --git a/setup.py b/setup.py index cc3037ebb7..6a3013de79 100644 --- a/setup.py +++ b/setup.py @@ -685,7 +685,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.11"], + "flashinfer": ["flashinfer-python==0.2.12"], }, cmdclass=cmdclass, package_data=package_data, From 7cd17e22d76473919c55aa75ac1897e4d3fbe277 Mon Sep 17 00:00:00 2001 From: xyxinyang <43821961+xyxinyang@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:41:55 +0800 Subject: [PATCH 427/932] [Model][V1] Support Ernie MTP (#22169) Signed-off-by: zhouchong Co-authored-by: zhouchong --- tests/models/registry.py | 3 + vllm/config/__init__.py | 31 ++- vllm/model_executor/models/ernie_mtp.py | 287 ++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 2 +- vllm/worker/worker.py | 3 +- 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/ernie_mtp.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 739d962279..6e6acfb8cd 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -556,6 +556,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", + trust_remote_code=True, + speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"), "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5", speculative_model="zai-org/GLM-4.5", min_transformers_version="4.54", diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 56a749789b..801fa97fe5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1463,7 +1463,8 @@ class ModelConfig: from vllm.distributed.utils import get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp"): + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ -1911,7 +1912,8 @@ class DeviceConfig: SpeculativeMethod = Literal["ngram", "eagle", 
"eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp"] + "mlp_speculator", "draft_model", "deepseek_mtp", + "ernie_mtp"] @config @@ -2044,6 +2046,16 @@ class SpeculativeConfig: "architectures": ["Glm4MoeMTPModel"] }) + if hf_config.model_type == "ernie4_5_moe": + hf_config.model_type = "ernie_mtp" + if hf_config.model_type == "ernie_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["ErnieMTPModel"] + }) + return hf_config + return hf_config def __post_init__(self): @@ -2062,8 +2074,8 @@ class SpeculativeConfig: if self.target_model_config and \ (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type \ - == "mimo"): + self.target_model_config.hf_text_config.model_type in + ("mimo","ernie4_5_moe")): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): @@ -2161,6 +2173,15 @@ class SpeculativeConfig: "one layer. Might need some code changes " \ "to support multiple layers." ) + elif (self.draft_model_config.hf_config.model_type == + "ernie_mtp"): + self.method = "ernie_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Ernie MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) else: self.method = "draft_model" raise NotImplementedError( @@ -2376,7 +2397,7 @@ class SpeculativeConfig: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp") + return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py new file mode 100644 index 0000000000..90a1267b28 --- /dev/null +++ b/vllm/model_executor/models/ernie_mtp.py @@ -0,0 +1,287 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ernie-MTP model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .llama import LlamaDecoderLayer +from .utils import is_pp_missing_parameter, maybe_prefix + + +class ErnieMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.mtp_emb_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_hidden_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_linear_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.mtp_block = LlamaDecoderLayer(config, cache_config, quant_config, + prefix) + + def forward( + self, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + + inputs_embeds = self.mtp_emb_norm(inputs_embeds) + previous_hidden_states = self.mtp_hidden_norm(previous_hidden_states) + + hidden_states = self.mtp_linear_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + + return hidden_states + + +class ErnieMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + ErnieMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]( + inputs_embeds, + positions, + previous_hidden_states, + spec_step_idx, + ) + + def 
compute_logits( + self, + hidden_states: torch.Tensor, + lm_head: ParallelLMHead, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + self.layers[str(self.mtp_start_layer_idx + spec_step_idx)] + logits = self.logits_processor(lm_head, hidden_states, + sampling_metadata) + return logits + + +class ErnieMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.config = vllm_config.model_config.hf_config + self.model = ErnieMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size) + self.sampler = get_sampler() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + assert spec_step_idx == 0, "ernie_mtp only support predict one token" + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, self.lm_head, + sampling_metadata, spec_step_idx) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + continue + if "rotary_emb.inv_freq" in name: + continue + if "mtp" in name: + name = self._rewrite_spec_layer_name(self.config, name) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + if "mtp" not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if "mtp_" not in name and ("embed_tokens" not in name + and "lm_head" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, config: PretrainedConfig, + name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. + """ + spec_layer_weight_names = [ + "embed_tokens", "mtp_emb_norm", "mtp_hidden_norm", + "mtp_linear_proj" + ] + layer_idx = config.num_hidden_layers + for weight_name in spec_layer_weight_names: + if weight_name in name: + name = name.replace( + f"model.{weight_name}.0.", + f"model.layers.{layer_idx}.{weight_name}.") + return name + name = name.replace("model.mtp_block.0.", + f"model.layers.{layer_idx}.mtp_block.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a94231b0f8..78ef270598 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -266,6 +266,7 @@ _SPECULATIVE_DECODING_MODELS = { # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), # Temporarily disabled. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index a8a160a0f9..8cd2ad12cf 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -194,7 +194,7 @@ class EagleProposer: hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) - if self.method == "deepseek_mtp": + if self.method in ("deepseek_mtp", "ernie_mtp"): last_hidden_states = ret_hidden_states else: last_hidden_states, hidden_states = ret_hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9dfea94756..7a01e585ba 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,8 @@ class Worker(LocalOrDistributedWorkerBase): "eagle", "deepseek_mtp", "glm4_moe_mtp", - "mimo_mtp")) \ + "mimo_mtp", + "ernie_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From c6d80a7a9620637ba5016dd3c0d6061e79eed73c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 20 Aug 2025 20:47:05 +0800 Subject: [PATCH 428/932] [Model] Improve olmo and olmo2 (#23228) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 4 ++-- vllm/model_executor/models/olmo.py | 22 +++++++++++++++++++--- vllm/model_executor/models/olmo2.py | 17 +++++++++++++++-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7908e42387..7308d00106 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -384,8 +384,8 @@ th { | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. 
| | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 1dc4df85c1..01639d3981 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -47,7 +47,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -91,6 +91,7 @@ class OlmoAttention(nn.Module): self.total_num_heads, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) # Rotary embeddings. @@ -114,6 +115,7 @@ class OlmoAttention(nn.Module): self.hidden_size, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) def forward( @@ -142,6 +144,7 @@ class OlmoMLP(nn.Module): self, config: OlmoConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -154,6 +157,7 @@ class OlmoMLP(nn.Module): [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) # Activation function. @@ -165,6 +169,7 @@ class OlmoMLP(nn.Module): self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -197,7 +202,7 @@ class OlmoDecoderLayer(nn.Module): prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = OlmoMLP(config, quant_config) + self.mlp = OlmoMLP(config, quant_config, prefix=f"{prefix}.mlp") # LayerNorm self.input_layernorm = nn.LayerNorm(config.hidden_size, @@ -326,10 +331,21 @@ class OlmoModel(nn.Module): return loaded_params -class OlmoForCausalLM(nn.Module, SupportsPP): +class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 499e6d30ed..66a0f91155 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,6 +33,7 @@ from torch import nn from transformers import Olmo2Config from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather @@ -48,7 +49,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -253,6 +254,7 @@ class Olmo2DecoderLayer(nn.Module): return hidden_states +@support_torch_compile class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -354,10 +356,21 @@ class Olmo2Model(nn.Module): return loaded_params -class Olmo2ForCausalLM(nn.Module, SupportsPP): +class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 38217877aa70041c0115ee367b75197af9cbc5ad Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 20 Aug 2025 21:34:49 +0800 Subject: [PATCH 429/932] [Fix] fix offline env use local mode path (#22526) Signed-off-by: rongfu.leng --- .../offline_mode/test_offline_mode.py | 35 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++- vllm/transformers_utils/config.py | 23 ++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a606eeab58..dd8d63ad31 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" +import dataclasses import importlib import sys @@ -9,6 +10,7 @@ import urllib3 from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.engine.arg_utils import EngineArgs MODEL_CONFIGS = [ { @@ -108,3 +110,36 @@ def _re_import_modules(): # Error this test if reloading a module failed if reload_exception is not None: raise reload_exception + + +@pytest.mark.skip_global_cleanup +@pytest.mark.usefixtures("cache_models") +def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): + # Set HF to offline mode and ensure we can still construct an LLM + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") + + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") + + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + engine_args = EngineArgs(model="facebook/opt-125m") + LLM(**dataclasses.asdict(engine_args)) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 679905aed9..48d9cd08af 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import huggingface_hub import regex as re import torch from pydantic import TypeAdapter, ValidationError @@ -39,7 +40,7 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -457,6 +458,13 @@ class EngineArgs: # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() + # when use hf offline,replace model id to local model path + if 
huggingface_hub.constants.HF_HUB_OFFLINE: + model_id = self.model + self.model = get_model_path(self.model, self.revision) + logger.info( + "HF_HUB_OFFLINE is True, replace model_id [%s] " \ + "to model_path [%s]",model_id, self.model) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d8c964fb2a..fe345bd8f0 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -14,7 +14,7 @@ from huggingface_hub import get_safetensors_metadata, hf_hub_download from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - HFValidationError, LocalEntryNotFoundError, + LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig @@ -335,6 +335,7 @@ def maybe_override_with_speculators_target_model( gguf_model_repo = Path(model).parent else: gguf_model_repo = None + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model if gguf_model_repo is None else gguf_model_repo, revision=revision, @@ -400,6 +401,7 @@ def get_config( raise ValueError(error_message) from e if config_format == ConfigFormat.HF: + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model, revision=revision, @@ -532,7 +534,7 @@ def try_get_local_file(model: Union[str, Path], revision=revision) if isinstance(cached_filepath, str): return Path(cached_filepath) - except HFValidationError: + except ValueError: ... 
return None @@ -908,3 +910,20 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: exc_info=e) return max_position_embeddings + + +def get_model_path(model: Union[str, Path], revision: Optional[str] = None): + if os.path.exists(model): + return model + assert huggingface_hub.constants.HF_HUB_OFFLINE + common_kwargs = { + "local_files_only": huggingface_hub.constants.HF_HUB_OFFLINE, + "revision": revision, + } + + if envs.VLLM_USE_MODELSCOPE: + from modelscope.hub.snapshot_download import snapshot_download + return snapshot_download(model_id=model, **common_kwargs) + + from huggingface_hub import snapshot_download + return snapshot_download(repo_id=model, **common_kwargs) From 44492358439f612b3934ccd902dbd90fcfa19866 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 22:19:30 +0800 Subject: [PATCH 430/932] [Bugfix] Ensure correctness of HCXVision processing (#23254) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 2 +- .../models/hyperclovax_vision.py | 116 ++++++++---------- 2 files changed, 55 insertions(+), 63 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d5b1de834a..02aecfad82 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -102,7 +102,7 @@ def _test_processing_correctness( partial(random_video, rng, min_frames=2, - max_frames=8, + max_frames=16, min_wh=128, max_wh=256), "audio": diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index d3ddc47ea9..f8b30d8d98 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -53,6 +53,21 @@ IMAGE_TOKEN: str = "<|dummy3|>" VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" +# Based on combine_frames_into_images in +# https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B/blob/main/processing_hyperclovax.py +def get_num_combined_frames( + num_frames: int, + max_grid_shape: tuple[int, int] = (3, 3), +) -> int: + max_num_grids = max_grid_shape[0] * max_grid_shape[1] + + # Calculate the number of canvases needed. 
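+    # Each canvas packs up to max_grid_shape[0] * max_grid_shape[1] frames;
+    # any leftover frames still occupy one extra, partially filled canvas.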
+ num_canvases = num_frames // max_num_grids + leftover_frames = num_frames % max_num_grids + + return num_canvases + (leftover_frames > 0) + + class HCXVisionMultimodalPixelInputs(TypedDict): type: Literal["pixel_values"] pixel_values_images: list[torch.Tensor] @@ -172,23 +187,20 @@ class HCXVisionMultiModalProcessor( def replace_multimodal_token( token_ids: torch.Tensor, target_token: int, - repeats: list, + repeats: list[int], ): - output = list() + output = list[int]() _repeats_idx = 0 for token_id in token_ids: if token_id == target_token: - output += [ - token_id.item(), - ] * repeats[_repeats_idx] + output += [token_id.item()] * repeats[_repeats_idx] _repeats_idx += 1 else: - output += [ - token_id.item(), - ] + output += [token_id.item()] + return torch.tensor(output, device=token_ids.device) - for video_idx, video_arr in enumerate(mm_data.get("videos", list())): + for video_idx, video_arr in enumerate(mm_data.get("videos", [])): if video_arr.dtype == np.uint8: continue mm_data["videos"][video_idx] = video_arr.astype(np.uint8) @@ -205,88 +217,68 @@ class HCXVisionMultiModalProcessor( if len(mm_data) > 0: # batchify input as a single item images = mm_data.get("images", None) - num_images = 0 - if images is not None: - num_images = len(images) - images = [ - images, - ] # batchify + batched_images = None if images is None else [images] - videos = mm_data.get("videos", - None) # list of video in single conversation - num_videos = 0 - if videos is not None: - num_videos = len(videos) - videos = [ - videos, - ] # batchify + # list of video in single conversation + videos = mm_data.get("videos", None) + batched_videos = None if videos is None else [videos] _processed_outputs = self.info.ctx.call_hf_processor( hf_processor=self.info.get_hf_processor(**mm_kwargs), data=dict( text=None, - images=images, - videos=videos, + images=batched_images, + videos=batched_videos, ), ) # mm-only for k, v in _processed_outputs.items(): - if len(v) < 1: - continue - elif k.endswith("_images"): - # list of list of 4D tensor -> list of 4D tensor + if isinstance(v, list) and len(v) > 0: + assert len(v) == 1 _processed_outputs[k] = v[0] - elif k.endswith("_videos"): - # list of list of 4D tensor -> list of 4D tensor - v = v[0] - if k == "pixel_values_videos": - v = torch.cat(v, dim=0) - _c, _w, _h = v.shape[-3:] - v = v.reshape(num_videos, -1, _c, _w, _h) - v = list(torch.unbind(v, dim=0)) - _processed_outputs[k] = v - if num_images > 0: + if images: tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) processed_outputs["input_ids"] = torch.stack([ replace_multimodal_token( token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - IMAGE_TOKEN), + target_token=image_token_id, repeats=_processed_outputs[ "vision_query_lengths_images"], ) for _input_ids in processed_outputs["input_ids"] ], dim=0) - if num_videos > 0: - tokenizer = self.info.get_tokenizer() - processed_outputs["input_ids"] = torch.stack([ - replace_multimodal_token( - token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - VIDEO_TOKEN), - repeats=_processed_outputs[ - "vision_query_lengths_videos"], - ) for _input_ids in processed_outputs["input_ids"] - ], - dim=0) - - _ratios = [ - len(_pixel_values) for _pixel_values in - _processed_outputs["pixel_values_videos"] - ] + if videos: _num_per_videos = [ - int(_e / sum(_ratios) * - len(_processed_outputs["vision_query_lengths_videos"])) - for _e in _ratios + get_num_combined_frames(len(video)) for video in videos + ] + 
_processed_outputs["pixel_values_videos"] = [ + _processed_outputs["pixel_values_videos"] + [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(len(videos)) ] _processed_outputs["vision_query_lengths_videos"] = [ _processed_outputs["vision_query_lengths_videos"] [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] - for _i in range(0, num_videos) + for _i in range(len(videos)) ] + tokenizer = self.info.get_tokenizer() + video_token_id = tokenizer.convert_tokens_to_ids(VIDEO_TOKEN) + processed_outputs["input_ids"] = torch.stack([ + replace_multimodal_token( + token_ids=_input_ids, + target_token=video_token_id, + repeats=[ + sum(lens) for lens in + _processed_outputs["vision_query_lengths_videos"] + ], + ) for _input_ids in processed_outputs["input_ids"] + ], + dim=0) + processed_outputs.update(_processed_outputs) return processed_outputs From b17109beeafbf9577c319ab61530810943a7fc4b Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 07:35:26 -0700 Subject: [PATCH 431/932] [Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045) Signed-off-by: Shixian Cui --- .../kernels/benchmark_grouped_gemm_cutlass.py | 35 +++- csrc/moe/moe_permute_unpermute_op.cu | 33 ++-- csrc/ops.h | 5 + .../cutlass_w8a8/moe/get_group_starts.cuh | 6 +- .../quantization/cutlass_w8a8/moe/moe_data.cu | 65 +++++-- .../cutlass_w8a8/scaled_mm_entry.cu | 24 +++ csrc/torch_bindings.cpp | 13 ++ tests/kernels/moe/test_cutlass_moe.py | 18 +- .../kernels/moe/test_moe_permute_unpermute.py | 6 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 22 ++- .../quantization/test_cutlass_scaled_mm.py | 2 +- vllm/_custom_ops.py | 22 +++ .../layers/fused_moe/cutlass_moe.py | 179 +++++++++++------- .../layers/fused_moe/moe_permute_unpermute.py | 29 ++- .../compressed_tensors_moe.py | 31 +++ 15 files changed, 369 insertions(+), 121 deletions(-) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 1d4e730f99..a6b42406b5 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -80,6 +80,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -111,6 +116,10 @@ def bench_run( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, @@ -125,6 +134,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -136,6 +149,10 @@ def bench_run( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): @@ -150,6 +167,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -194,6 +215,10 @@ def bench_run( 
w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -231,6 +256,10 @@ def bench_run( "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -289,6 +318,10 @@ def bench_run( w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, per_act_token, @@ -297,7 +330,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 2922352a3f..ca0c873f49 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -45,8 +45,6 @@ void moe_permute( auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); auto sorted_row_idx = torch::empty_like(inv_permuted_idx); - auto align_expert_first_token_offset = - torch::zeros_like(expert_first_token_offset); CubKeyValueSorter sorter{}; int64_t* valid_num_ptr = nullptr; @@ -85,12 +83,14 @@ void moe_permute( }); // get m_indices and update expert_first_token_offset with align block - getMIndices(get_ptr(expert_first_token_offset), - get_ptr(align_expert_first_token_offset), - get_ptr(m_indices), n_local_expert, align_block_size_value, - stream); + // this is only required for DeepGemm and not required for CUTLASS group gemm if (align_block_size.has_value()) { - // update align_expert_first_token_offset + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); expert_first_token_offset.copy_(align_expert_first_token_offset); } } @@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& expert_first_token_offset, torch::Tensor& src_row_id2dst_row_id_map, torch::Tensor& m_indices) { - TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); } -void moe_unpermute(const torch::Tensor& input, - const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indices, - const std::optional& expert_map, - int64_t n_expert, int64_t n_local_expert, int64_t topk, - const std::optional& align_block_size, - torch::Tensor& permuted_input, - torch::Tensor& expert_first_token_offset, - torch::Tensor& src_row_id2dst_row_id_map, - torch::Tensor& m_indices) { +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, + const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx, + const std::optional& expert_first_token_offset, int64_t topk, + torch::Tensor& hidden_states) { TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); } @@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() { 
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} +} \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 64bcec6ca1..86fe848e2f 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -229,6 +229,11 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh index 6c6e897908..15bb2c3005 100644 --- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -10,7 +10,7 @@ template __global__ void get_group_gemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, ElementC** out_offsets, ElementAccumulator** a_scales_offsets, ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, ElementAB* b_base_as_int, ElementC* out_base_as_int, @@ -34,7 +34,7 @@ __global__ void get_group_gemm_starts( else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ get_group_gemm_starts \ <<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ + static_cast(expert_offsets.data_ptr()), \ static_cast(a_ptrs.data_ptr()), \ static_cast(b_ptrs.data_ptr()), \ static_cast(out_ptrs.data_ptr()), \ @@ -61,6 +61,8 @@ void run_get_group_gemm_starts( TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); int num_experts = static_cast(expert_offsets.size(0)); bool per_act_token = a_scales.numel() != 1; diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 100f485084..49cafcc32a 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -104,6 +104,53 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids, } } +namespace { +inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + torch::Tensor& atomic_buffer, + int64_t num_experts, int64_t n, + int64_t k, cudaStream_t stream, + const bool swap_ab) { + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + + const int32_t* topk_ptr = static_cast(topk_ids.data_ptr()); + int32_t* ps1_ptr = static_cast(problem_sizes1.data_ptr()); + int32_t* ps2_ptr = static_cast(problem_sizes2.data_ptr()); + int32_t* atomic_ptr = static_cast(atomic_buffer.data_ptr()); + + if (swap_ab) { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } else { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } +} +} // namespace + +void 
get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + // Swap-AB should be disabled for FP4 path + bool may_swap_ab = (!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD); + + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); +} + void get_cutlass_moe_mm_data_caller( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -121,21 +168,9 @@ void get_cutlass_moe_mm_data_caller( bool may_swap_ab = (!blockscale_offsets.has_value()) && (topk_ids.numel() <= SWAP_AB_THRESHOLD); - if (may_swap_ab) { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } else { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); if (blockscale_offsets.has_value()) { // fp4 path diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 106bacb488..84843ee6e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -76,6 +76,11 @@ void get_cutlass_moe_mm_data_caller( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -293,6 +298,25 @@ void get_cutlass_moe_mm_data( version_num, ". Required capability: 90 or 100"); } +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + int32_t version_num = get_sm_version_num(); +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) + get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, + problem_sizes2, num_experts, n, k, + blockscale_offsets); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " + "kernel for CUDA device capability: ", + version_num, ". 
Required capability: 90 or 100"); +} + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7079671c2e..3a0ff6eaa7 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -440,6 +440,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // A function that computes problem sizes for each expert's multiplication + // used by the two mms called from fused MoE operation. It takes topk_ids as + // an input, and computes problem_sizes1 and problem_sizes2 only. + ops.def( + "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, " + " Tensor! problem_sizes1, " + " Tensor! problem_sizes2, " + " int num_experts, int n, int k, " + " Tensor? blockscale_offsets) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, + &get_cutlass_moe_mm_problem_sizes); + // A function that computes data required to run fused MoE with w8a8 grouped // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs // as an input, and computes expert_offsets (token start indices of each diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 81fb3ec1de..c84f66383b 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -207,6 +207,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'topk_ids': topk_ids, 'w1_scale': moe_tensors.w1_scale, 'w2_scale': moe_tensors.w2_scale, + 'ab_strides1': moe_tensors.ab_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides1': moe_tensors.c_strides1, + 'c_strides2': moe_tensors.c_strides2, 'per_act_token': per_act_token, 'a1_scale': None #moe_tensors.a_scale } @@ -424,8 +428,8 @@ def test_run_cutlass_moe_fp8( topk_ids[0][1] = 1 workspace13_shape = (m * topk, max(2 * n, k)) - workspace2_shape = (m * topk, n) - output_shape = (m * topk, k) + workspace2_shape = (m * topk, max(n, k)) + output_shape = (m, k) workspace13 = torch.empty(prod(workspace13_shape), device="cuda", @@ -440,6 +444,11 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, torch.float8_e4m3fn, @@ -448,8 +457,9 @@ def test_run_cutlass_moe_fp8( func = lambda output: run_cutlass_moe_fp8( output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, workspace13, workspace2, None, mt.a.dtype, - per_act_token, per_out_channel, False) + a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2, + workspace13, workspace2, None, mt.a.dtype, per_act_token, + per_out_channel, False, topk_weights) workspace13.random_() output_random_workspace = torch.empty(output_shape, diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 6ca01f9271..d71664d94b 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ 
b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -238,7 +238,11 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, atol=0, rtol=0) # check mindice - torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # current kernel usage assumes deepgemm requires align_block_size + # when it's not provided then we don't compute m_indices (for cutlass) + if align_block_size is not None: + torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # check permuted_hidden_states, only valid token torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx], permuted_hidden_states[valid_row_idx], diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index f98937ee6c..98908f2714 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -76,6 +76,7 @@ def pplx_cutlass_moe( assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape + intermediate_dim = w2.shape[2] num_experts = w1.shape[0] block_size = hidden_dim # TODO support more cases device = pgi.device @@ -124,8 +125,27 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) + ab_strides1 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_local_experts, ), + intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_local_experts, ), + 2 * intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, - out_dtype, per_act_token, per_out_ch) + out_dtype, per_act_token, per_out_ch, + ab_strides1, ab_strides2, c_strides1, + c_strides2) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 8730eeaaa7..a15decdf6f 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, expert_offsets = torch.zeros((num_experts + 1), device=device, - dtype=torch.int32) + dtype=torch.int64) problem_sizes = torch.zeros((num_experts, 3), device=device, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0d556053f8..39da08847b 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -844,6 +844,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor, blockscale_offsets) +def get_cutlass_moe_mm_problem_sizes( + topk_ids: torch.Tensor, + problem_sizes1: torch.Tensor, + problem_sizes2: torch.Tensor, + num_experts: int, + n: int, + k: int, + blockscale_offsets: Optional[torch.Tensor] = None): + """ + Compute only the per-expert problem sizes needed by the two grouped matrix + multiplications used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token→expert mapping) and computes: + - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's + multiplication for the two grouped MMs + used in the fused MoE operation. 
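+
+    Unlike get_cutlass_moe_mm_data, this variant does not fill expert_offsets
+    or the sorted token maps; only the two problem-size tensors are written.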
+ """ + return torch.ops._C.get_cutlass_moe_mm_problem_sizes( + topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, + blockscale_offsets) + + def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor): """ Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor. diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 0a02b558d0..95d23ec034 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -9,12 +9,13 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.scalar_type import scalar_types @@ -34,6 +35,10 @@ def run_cutlass_moe_fp8( w2_scale: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], @@ -41,6 +46,7 @@ def run_cutlass_moe_fp8( per_act_token: bool, per_out_ch: bool, use_batched_format: bool, + topk_weights: Optional[torch.Tensor], ): a1q = hidden_states @@ -99,6 +105,22 @@ def run_cutlass_moe_fp8( topk = local_topk_ids.size(1) local_E = w1.size(0) + if use_batched_format: + mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2)) + act_out = _resize_cache(workspace2, (local_E * padded_M, N)) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (local_E * padded_M, N)) + mm2_out = _resize_cache(workspace2, (local_E * padded_M, K)) + else: + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), + (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (M * topk, N)) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + if use_batched_format: assert expert_num_tokens is not None @@ -120,11 +142,10 @@ def run_cutlass_moe_fp8( w2_scale = w2_scale.reshape(w2_scale.size(0), -1) a1q = a1q.reshape(-1, a1q.size(2)) a1q_scale = a1q_scale.reshape(-1, a1q_scale.size(2)).contiguous() - + # c3x get_group_gemm_starts expects int64 to avoid overflow + # during offset calculations + expert_offsets = expert_offsets.to(torch.int64) else: - expert_offsets = torch.empty((global_num_experts + 1), - dtype=torch.int32, - device=device) problem_sizes1 = torch.empty((global_num_experts, 3), dtype=torch.int32, device=device) @@ -132,84 +153,57 @@ def run_cutlass_moe_fp8( dtype=torch.int32, device=device) - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. 
We fill it - # zeros for correctness. - if expert_map is not None: - a_map = torch.zeros((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - else: - a_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, - problem_sizes1, problem_sizes2, a_map, - c_map, global_num_experts, N, K) - - a1q = _fp8_perm(a1q, a_map) - a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + num_expert = global_num_experts if expert_map is None \ + else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm) expert_offsets = expert_offsets[:-1] - ab_strides1 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - c_strides1 = torch.full((w1.size(0), ), - 2 * N, - device=device, - dtype=torch.int64) - ab_strides2 = torch.full((w1.size(0), ), - N, - device=device, - dtype=torch.int64) - c_strides2 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - - if use_batched_format: - c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) - c2 = _resize_cache(workspace2, (local_E * padded_M, N)) - c3 = _resize_cache(workspace13, (local_E * padded_M, K)) - else: - c1 = _resize_cache(workspace13, (M * topk, N * 2)) - c2 = _resize_cache(workspace2, (M * topk, N)) - c3 = _resize_cache(workspace13, (M * topk, K)) + ops.get_cutlass_moe_mm_problem_sizes(local_topk_ids, problem_sizes1, + problem_sizes2, + global_num_experts, N, K) if not per_act_token and (expert_map is not None or use_batched_format): # this is necessary to avoid imprecise scale calculation caused by # random data in the unused workspace. The workspace is unused when # this rank handles only partial tokens, or when it is batched . - c1.fill_(0) + mm1_out.fill_(0) - ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets, + ops.cutlass_moe_mm(mm1_out, a1q, w1, a1q_scale, w1_scale, expert_offsets, problem_sizes1, ab_strides1, ab_strides1, c_strides1, per_act_token, per_out_ch) - activation_callable(c2, c1) + activation_callable(act_out, mm1_out) a2q, a2q_scale = ops.scaled_fp8_quant( - c2, a2_scale, use_per_token_if_dynamic=per_act_token) + act_out, + a2_scale, + use_per_token_if_dynamic=per_act_token, + output=quant_out) if expert_map is not None: - c3.fill_(0) + mm2_out.fill_(0) - ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets, + ops.cutlass_moe_mm(mm2_out, a2q, w2, a2q_scale, w2_scale, expert_offsets, problem_sizes2, ab_strides2, ab_strides2, c_strides2, per_act_token, per_out_ch) if use_batched_format: - output.copy_(c3.reshape(local_E, padded_M, K), non_blocking=True) + output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True) else: - # We can't do this inplace because output may point to the same tensor - # as c3. - output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. 
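+ # moe_unpermute restores the original token order and reduces over the + # top-k experts using topk_weights inside a single CUDA kernel, which is + # why finalize_weight_and_reduce_impl below returns TopKWeightAndReduceNoOP().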
+ moe_unpermute(out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm) class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): @@ -219,6 +213,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( @@ -229,6 +227,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): block_shape=block_shape, )) self.out_dtype = out_dtype + self.ab_strides1 = ab_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. @@ -272,10 +274,11 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, + a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, + self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, - use_batched_format) + use_batched_format, topk_weights) class CutlassExpertsFp8(CutlassExpertsFp8Base): @@ -285,12 +288,20 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) @@ -307,6 +318,10 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): def supports_expert_map(self) -> bool: return True + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + def workspace_shapes( self, a: torch.Tensor, @@ -320,8 +335,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -335,12 +350,20 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) assert max_experts_per_worker > 0 @@ -378,7 +401,8 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): assert num_dp is not None workspace1 = (self.max_experts_per_worker, padded_M * num_dp, max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + 
workspace2 = (self.max_experts_per_worker, padded_M * num_dp, + max(N // 2, K)) output = (self.max_experts_per_worker, padded_M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -392,6 +416,10 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, @@ -419,6 +447,17 @@ def cutlass_moe_fp8( Shape: [num_experts] or [num_experts, 2N] - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K] + - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. + Shape: [num_experts] + - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. + Shape: [num_experts] + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. Shape: scalar or [M] - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to @@ -450,6 +489,10 @@ def cutlass_moe_fp8( out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, ), ) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index d9059f50b4..16a155e718 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -82,7 +82,8 @@ def moe_permute( n_local_expert: int = -1, expert_map: Optional[torch.Tensor] = None, align_block_size: Optional[int] = None, - fill_invalid_expert: int = -1 + fill_invalid_expert: int = -1, + permuted_hidden_states: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -95,14 +96,17 @@ def moe_permute( - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices + - permuted_hidden_states (Optional[torch.Tensor]): Optional output tensor. + If None, the output tensor will be created in this function. Returns: - permuted_hidden_states (torch.Tensor): permuted activation. - - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states + - a1q_scale (Optional[torch.Tensor]): permuted quant scale for hidden_states + if original scale not per-tensor scaling - expert_first_token_offset (torch.Tensor): offset of the first token of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. 
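A minimal usage sketch (not part of the patch) of the new `permuted_hidden_states` argument described in the docstring above: the caller pre-allocates the permute output, as `run_cutlass_moe_fp8` does with its fp8 view of `workspace2`. Here `a1q`, `a1q_scale`, `topk_ids`, `expert_map`, `global_num_experts`, `local_E`, `M`, `topk` and `K` are assumed to already exist with the meanings they have in that function.

```python
import torch

from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    moe_permute)

# Pre-allocated destination for the permuted activations; moe_permute asserts
# that its shape matches (permuted_row_size, n_hidden).
a1q_perm = torch.empty((M * topk, K), dtype=torch.float8_e4m3fn, device="cuda")

# Mirror the call site in run_cutlass_moe_fp8: with an expert_map, the number
# of experts seen by the kernel is the size of the map.
n_expert = global_num_experts if expert_map is None else expert_map.size(0)
a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
    a1q, a1q_scale, topk_ids, n_expert, local_E, expert_map,
    permuted_hidden_states=a1q_perm)
```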
@@ -122,11 +126,16 @@ def moe_permute( 1) // align_block_size * align_block_size if n_local_expert == -1: n_local_expert = n_expert - permuted_hidden_states = torch.empty( - (permuted_row_size, n_hidden), - dtype=hidden_states.dtype, - device=hidden_states.device, - ) + if permuted_hidden_states is None: + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), ( + f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}" + f" but got {permuted_hidden_states.size()}") + token_expert_indices = torch.arange(0, n_token * topk, dtype=torch.int32, @@ -153,7 +162,8 @@ def moe_permute( align_block_size, permuted_hidden_states, expert_first_token_offset, inv_permuted_idx, permuted_idx, m_indices) - if a1q_scale is not None: + + if a1q_scale is not None and a1q_scale.dim() > 1: a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk] return (permuted_hidden_states, a1q_scale, expert_first_token_offset, @@ -185,6 +195,7 @@ def moe_unpermute( n_hidden = permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" + torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, inv_permuted_idx, expert_first_token_offset, topk, out) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8ca8249e69..7bc35cd81a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -669,6 +669,25 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): from vllm.model_executor.layers.fused_moe import fused_experts self.fused_experts_func = fused_experts + if self.use_cutlass: + device = layer.w13_weight.device + # ab_strides1 and c_strides2 are the same + self.ab_strides1_c_strides2 = torch.full( + (layer.local_num_experts, ), + layer.hidden_size, + device=device, + dtype=torch.int64) + self.ab_strides2 = torch.full( + (layer.local_num_experts, ), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.c_strides1 = torch.full( + (layer.local_num_experts, ), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -693,6 +712,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) else: logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) @@ -700,6 +723,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) self.disable_expert_map = (num_dispatchers > 1 @@ -822,6 +849,10 @@ class 
CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) From 5efd6905bc8469a30664de83bdafaad56aa92903 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 23:42:28 +0800 Subject: [PATCH 432/932] [CLI][Doc] Formalize `--mm-encoder-tp-mode` (#23190) Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 45 ++++++++++++++++++++++++ vllm/config/__init__.py | 34 +++++++++++++++++- vllm/config/parallel.py | 4 --- vllm/engine/arg_utils.py | 35 +++++++++++------- vllm/model_executor/models/mllama4.py | 4 +-- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/step3_vl.py | 3 +- 7 files changed, 104 insertions(+), 24 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index c7f50497d6..db9dfb313f 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,6 +129,51 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. +### Batch-level DP for Multi-Modal Encoders + +By default, TP is used to shard the weights of multi-modal encoders just like for language decoders, +in order to reduce the memory and compute load on each GPU. + +However, since the size of multi-modal encoders is very small compared to language decoders, +there is relatively little gain from TP. On the other hand, TP incurs significant communication +overhead because of all-reduce being performed after every layer. + +Given this, it may be advantageous to instead shard the batched input data using TP, essentially +performing batch-level DP. This has been shown to improve the throughput by around 10% for +`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations, +batch-level DP can provide another 40% increase in throughput compared to regular TP. + +Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank, +there will be a minor increase in memory consumption, which may cause OOM if you can barely fit the model already. + +You can enable batch-level DP by setting `mm_encoder_tp_mode="data"`, for example: + +```python +from vllm import LLM + +llm = LLM( + model="Qwen/Qwen2.5-VL-72B-Instruct", + # Create two EngineCore instances, one per DP rank + data_parallel_size=2, + # Within each EngineCore instance: + # The vision encoder uses TP=4 (not DP=2) to shard the input data + # The language decoder uses TP=4 to shard the weights as usual + tensor_parallel_size=4, + mm_encoder_tp_mode="data", +) +``` + +!!! important + Batch-level DP is not to be confused with API request-level DP + (which is instead controlled by `data_parallel_size`). + +The availability of batch-level DP depends on the model implementation.
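For reference, the same mode can also be selected through the engine-argument layer this patch extends (the new `--mm-encoder-tp-mode` CLI flag maps onto this field); a minimal sketch using `EngineArgs`, with the model name chosen only as an example:

```python
from vllm.engine.arg_utils import EngineArgs

# Sketch: mm_encoder_tp_mode is the field added in this patch; it replaces the
# deprecated --enable-multimodal-encoder-data-parallel flag.
engine_args = EngineArgs(
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    tensor_parallel_size=4,
    mm_encoder_tp_mode="data",
)
```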
+Currently, the following models support `mm_encoder_tp_mode="data"`: + +- Llama4 () +- Qwen2.5-VL () +- Step3 () + ## Input Processing ### Parallel Processing diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 801fa97fe5..5b5d477ef0 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -258,6 +258,7 @@ TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"] +MMEncoderTPMode = Literal["weights", "data"] @config @@ -438,6 +439,19 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -856,8 +870,10 @@ class ModelConfig: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling) + skip_mm_profiling=self.skip_mm_profiling, + ) return None @@ -2547,6 +2563,22 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """ + Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP. + """ + interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800..7a9e68f0ea 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -137,10 +137,6 @@ class ParallelConfig: rank: int = 0 """Global rank in distributed setup.""" - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. 
- Only support LLama4 for now""" - @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 48d9cd08af..6869c3f23f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -28,12 +28,12 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, ModelConfig, ModelDType, - ModelImpl, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, MultiModalConfig, + ObservabilityConfig, ParallelConfig, PoolerConfig, + PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, + SchedulerPolicy, SpeculativeConfig, TaskOption, + TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -352,6 +352,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields enable_lora: bool = False @@ -434,16 +435,14 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - enable_multimodal_encoder_data_parallel: bool = \ - ParallelConfig.enable_multimodal_encoder_data_parallel + # DEPRECATED + enable_multimodal_encoder_data_parallel: bool = False logits_processors: Optional[list[Union[ str, type[LogitsProcessor]]]] = ModelConfig.logits_processors """Custom logitproc types""" async_scheduling: bool = SchedulerConfig.async_scheduling - # DEPRECATED - enable_prompt_adapter: bool = False kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill @@ -685,7 +684,8 @@ class EngineArgs: **parallel_kwargs["worker_extension_cls"]) parallel_group.add_argument( "--enable-multimodal-encoder-data-parallel", - **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) + action="store_true", + deprecated=True) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -735,6 +735,8 @@ class EngineArgs: multimodal_group.add_argument("--disable-mm-preprocessor-cache", action="store_true", deprecated=True) + multimodal_group.add_argument( + "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -909,6 +911,14 @@ class EngineArgs: self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + if self.enable_multimodal_encoder_data_parallel: + logger.warning( + "--enable-multimodal-encoder-data-parallel` is deprecated " + "and will be removed in v0.13. 
" + "Please use `--mm-encoder-tp-mode data` instead.") + + self.mm_encoder_tp_mode = "data" + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -947,6 +957,7 @@ class EngineArgs: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1258,8 +1269,6 @@ class EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, - enable_multimodal_encoder_data_parallel=self. - enable_multimodal_encoder_data_parallel, ) if model_config.is_multimodal_model: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 35103eac8f..595bdd17cf 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -728,8 +728,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 34eec10296..811ecffcc1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -877,8 +877,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 5d41a9e569..f8877b584b 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -882,8 +882,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. 
- enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config.get_limit_per_prompt("image"): self.vision_model = Step3VisionTransformer( From d6d13bd49ed7fda56ac6a1b0aa53621490c975ac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 09:05:29 -0700 Subject: [PATCH 433/932] [Misc] Add max_seq_len to CommonAttentionMetadata (#23216) Signed-off-by: Woosuk Kwon --- tests/v1/attention/utils.py | 2 ++ tests/v1/spec_decode/test_tree_attention.py | 2 ++ vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/utils.py | 6 ++++++ vllm/v1/attention/backends/xformers.py | 2 +- vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/worker/gpu_model_runner.py | 4 ++++ 12 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index a4e38eb32f..e547e71e0c 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -58,6 +58,7 @@ def create_common_attn_metadata( dtype=torch.int32, device=device) seq_lens_cpu = seq_lens.cpu() + max_seq_len = int(seq_lens_cpu.max()) # Create computed tokens (context length for each sequence) context_lens = [ @@ -101,6 +102,7 @@ def create_common_attn_metadata( num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, causal=True, diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 456ce712d3..6317817408 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -50,6 +50,7 @@ def forward_attention( dtype=torch.int32, ) context_lens = seq_lens - query_lens + max_seq_len = int(seq_lens.max()) max_query_len = q_len num_actual_tokens = query_start_loc[-1] @@ -81,6 +82,7 @@ def forward_attention( num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table, slot_mapping=slot_mapping, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ab7a71a399..eed3cba9a2 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -233,7 +233,7 @@ class FlashAttentionMetadataBuilder( num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 53fafbc4af..8a25088848 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -463,7 +463,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): page_size = self.page_size max_q_len = common_attn_metadata.max_query_len - max_seq_len = common_attn_metadata.seq_lens_cpu.max().item() + max_seq_len = 
common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e599411b2d..abca981035 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -305,7 +305,7 @@ class FlexAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 36b5853bfd..b9ff113573 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -270,7 +270,7 @@ class AiterFlashAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 5d10e9e260..2a0c52377c 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -205,7 +205,7 @@ class TreeAttentionMetadataBuilder( q_start_loc = common_attn_metadata.query_start_loc max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 48a9af3dec..c69dd8415f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -90,7 +90,7 @@ class TritonAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 94dd3d2629..57c4d436c5 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -58,6 +58,8 @@ class CommonAttentionMetadata: """Total number of tokens in batch""" max_query_len: int """Longest query in batch""" + max_seq_len: int + """Longest context length in batch""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -107,6 +109,7 @@ def _make_metadata_with_slice( seq_lens = attn_metadata.seq_lens[request_slice] seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + max_seq_len = int(seq_lens_cpu.max()) num_computed_tokens_cpu = 
attn_metadata.num_computed_tokens_cpu[ request_slice] @@ -128,6 +131,7 @@ def _make_metadata_with_slice( num_reqs=num_requests, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, ) @@ -520,6 +524,7 @@ def make_local_attention_virtual_batches( query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) seq_lens_cpu = torch.from_numpy(seqlens_k_local) + max_seq_len = int(seq_lens_cpu.max()) return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, @@ -531,6 +536,7 @@ def make_local_attention_virtual_batches( num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), + max_seq_len=max_seq_len, block_table_tensor=block_table_local, slot_mapping=common_attn_metadata.slot_mapping, causal=True, diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index fe732c6017..b305bc1539 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -231,7 +231,7 @@ class XFormersAttentionMetadataBuilder( q_seqlens = torch.diff(q_start_loc) max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8cd2ad12cf..cc2b2a139d 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -582,6 +582,7 @@ class EagleProposer: num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), + max_seq_len=new_seq_lens_cpu.max().item(), block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping[token_indices], causal=True, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0bab3367c..d9770226b1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -774,6 +774,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.seq_lens_np[num_reqs:].fill(0) self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True) seq_lens = self.seq_lens[:num_reqs] + max_seq_len = self.seq_lens_np[:num_reqs].max().item() # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( @@ -886,6 +887,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, causal=True, @@ -2338,6 +2340,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=self.max_model_len, block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. 
@@ -3343,6 +3346,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=self.seq_lens_cpu[:num_reqs].max().item(), block_table_tensor=dummy_block_table, slot_mapping=dummy_slot_mapping, causal=False, From 3b11b26b5069718a6bde11b9041681bc17369f96 Mon Sep 17 00:00:00 2001 From: JartX Date: Wed, 20 Aug 2025 18:08:29 +0200 Subject: [PATCH 434/932] [FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not compatible with AITER (#22795) Signed-off-by: JartX Signed-off-by: tjtanaa Co-authored-by: tjtanaa --- vllm/v1/spec_decode/eagle.py | 80 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index cc2b2a139d..0a0e9fed72 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast from dataclasses import replace -from typing import Optional +from importlib.util import find_spec +from typing import Optional, Protocol import numpy as np import torch @@ -20,8 +21,6 @@ from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.rocm_aiter_fa import ( - AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata @@ -34,6 +33,17 @@ logger = init_logger(__name__) PADDING_SLOT_ID = -1 +class EagleAttentionMetadata(Protocol): + # Required attributes + num_actual_tokens: int + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + class EagleProposer: def __init__( @@ -97,6 +107,20 @@ class EagleProposer: dtype=self.dtype, device=device) + # Determine allowed attention backends once during initialization. + self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] + if current_platform.is_rocm(): + rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] + # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend + if find_spec("vllm.v1.attention.backends.rocm_aiter_fa"): + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + rocm_types.append(AiterFlashAttentionMetadata) + self.allowed_attn_types = tuple(rocm_types) + else: + self.allowed_attn_types = (FlashAttentionMetadata, + TreeAttentionMetadata) + # Parse the speculative token tree. spec_token_tree = self.speculative_config.speculative_token_tree self.tree_choices: list[tuple[int, @@ -165,7 +189,7 @@ class EagleProposer: for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) else: num_input_tokens = num_tokens @@ -225,25 +249,13 @@ class EagleProposer: # TODO: Currently, MTP module released by deepseek only has # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. 
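The backend selection above guards the AITER import with `importlib.util.find_spec`, so the optional module is only imported when it can actually be resolved. A generic, self-contained sketch of that pattern (using `xformers` purely as an example of an optional dependency):

```python
from importlib.util import find_spec

# find_spec returns None when the top-level module cannot be located, so this
# never raises ImportError on platforms where the optional backend is absent.
if find_spec("xformers") is not None:
    from xformers import ops as xops
else:
    xops = None  # callers must check for None and take a fallback path
```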
- - # On ROCm, both AiterFlashAttention and TritonAttention - # support multi-token eagle spec decode. - if current_platform.is_rocm(): - assert isinstance( - attn_metadata, - (TritonAttentionMetadata, AiterFlashAttentionMetadata, - FlashAttentionMetadata)) - else: - # Currently, only FlashAttention supports multi-token eagle spec - # decode. This is because the code below makes assumptions about - # attn_metadata attributes available. - assert isinstance(attn_metadata, FlashAttentionMetadata) + assert isinstance(attn_metadata, self.allowed_attn_types) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] if self.use_cuda_graph and \ - batch_size <= self.cudagraph_batch_sizes[-1]: + batch_size <= self.cudagraph_batch_sizes[-1]: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size @@ -449,7 +461,7 @@ class EagleProposer: num_tokens, -1) if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_tokens) else: @@ -508,19 +520,19 @@ class EagleProposer: """ # E.g. # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1, q1 + q2, q1 + q2 + q3] + # [0, q1, q1 + q2, q1 + q2 + q3] # common_attn_metadata.seq_lens{_cpu}: [s1, s2, s3] # num_rejected_tokens: [n1, n2, n3] # This function computes the intermediate values: # num_tokens_per_req: [q1 - n1, q2 - n2, q3 - n3] # And returns: # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] + # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] # common_attn_metadata.seq_lens{_cpu}: - # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] + # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] # token_indices: [0, 1, ..., q1 - n1 - 1, - # q1, q1 + 1, ..., q1 + q2 - n2 - 1, - # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + # q1, q1 + 1, ..., q1 + q2 - n2 - 1, + # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu @@ -564,9 +576,9 @@ class EagleProposer: old_query_start_locs_expanded = np.repeat( query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np) # Final token indices are: - # [0, 1, // req 1 - # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 - # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 + # [0, 1, // req 1 + # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 + # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 token_indices_np = token_offests + old_query_start_locs_expanded token_indices = torch.from_numpy(token_indices_np).to( device, non_blocking=True) @@ -616,20 +628,18 @@ class EagleProposer: target_language_model = target_model # share embed_tokens with the target model if needed if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: + and self.model.model.embed_tokens.weight.shape \ + == target_language_model.model.embed_tokens.weight.shape: logger.info( - "Assuming the EAGLE head shares the same vocab embedding" \ - " with the target model." - ) + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") del self.model.model.embed_tokens self.model.model.embed_tokens = ( target_language_model.model.embed_tokens) else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" \ - " from the target model." 
- ) + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") # share lm_head with the target model if needed # some model definition do not define lm_head explicitly From dfd2382039c38be80d6c2c9b56e441b5bd7cd0ad Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:52:59 -0700 Subject: [PATCH 435/932] [torch.compile] Support conditional torch.compile per module (#22269) Signed-off-by: Yong Hoon Shin --- .buildkite/test-pipeline.yaml | 2 + .../compile/piecewise/test_multiple_graphs.py | 135 +++------- tests/compile/test_decorator.py | 251 ++++++++++++++++++ vllm/compilation/decorators.py | 21 +- 4 files changed, 307 insertions(+), 102 deletions(-) create mode 100644 tests/compile/test_decorator.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f7f1db75b..7454206640 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -328,6 +328,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental] @@ -341,6 +342,7 @@ steps: - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_full_cudagraph.py + - pytest -v -s compile/piecewise/test_multiple_graphs.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental] diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index e460d70951..f5e2d9ddb7 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import (ignore_torch_compile, support_torch_compile) -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) -from vllm.envs import VLLM_USE_V1 -from vllm.forward_context import set_forward_context +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel): return x -def test_ignore_torch_compile_decorator(): - assert VLLM_USE_V1 - - # piecewise - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - splitting_ops=["silly.attention"], - cudagraph_capture_sizes=[1, 2], - )) - - @support_torch_compile - class A(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = '', - **kwargs) -> None: - super().__init__() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x + x - attn_output = torch.empty_like(x) - torch.ops.silly.attention(x, x, x, attn_output) - x = attn_output - x = x * 3 - return x - - @ignore_torch_compile - class B(A): - ... - - @support_torch_compile - class C(B): - ... 
- - with set_current_vllm_config(vllm_config): - mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() - - # A has support_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - # first run is for compile - mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - # run cudagraph captured sizes - mod_A(torch.randn(2, MLP_SIZE).cuda()) - mod_A(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() - - # B's ignore_torch_compile should override A's support_torch_compile - with compilation_counter.expect( - num_graphs_seen=0, - num_piecewise_graphs_seen=0, - num_piecewise_capturable_graphs_seen=0, - num_backend_compilations=0, - num_cudagraph_captured=0, - ), set_forward_context({}, vllm_config=vllm_config): - mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_B(torch.randn(2, MLP_SIZE).cuda()) - mod_B(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() - - # C's support_torch_compile should override B's ignore_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_C(torch.randn(2, MLP_SIZE).cuda()) - mod_C(torch.randn(1, MLP_SIZE).cuda()) - - @torch.inference_mode -def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): +def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor, + cudagraph_runtime_mode: CUDAGraphMode): with set_forward_context({}, vllm_config=vllm_config): - # First run is for compile + # warmup for the model with cudagraph_mode NONE model(inputs) - # Run CUDAGraph captured sizes - model(inputs[:2]) - model(inputs[:1]) + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(inputs[:2]) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(inputs[:1]) - output = model(inputs[:2]) + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(inputs[:2]) output = output.cpu() return output.cpu() @@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): splitting_ops=["silly.attention"], cudagraph_capture_sizes=[1, 2], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_cudagraph_captured=8, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # no 
compile or cudagraph vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.NO_COMPILATION, )) + cudagraph_runtime_mode = CUDAGraphMode.NONE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # piecewise compile without CUDA graph vllm_config = VllmConfig(compilation_config=CompilationConfig( @@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=False, splitting_ops=["silly.attention"], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=4, num_cudagraph_captured=0, # no cudagraph captured ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # Generally don't expect outputs with and without inductor # to be bitwise equivalent diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py new file mode 100644 index 0000000000..51f8ddd566 --- /dev/null +++ b/tests/compile/test_decorator.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import (ignore_torch_compile, + support_torch_compile) +from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, + CUDAGraphMode, VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + +BATCH_SIZE = 32 +MLP_SIZE = 128 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@torch.inference_mode +def run_model(vllm_config: VllmConfig, model: nn.Module, + cudagraph_runtime_mode: CUDAGraphMode): + with set_forward_context({}, vllm_config=vllm_config): + # warmup for the model with cudagraph_mode NONE + model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(torch.randn(2, MLP_SIZE).cuda()) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(torch.randn(1, MLP_SIZE).cuda()) + + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + 
num_tokens=2, )): + output = model(torch.randn(2, MLP_SIZE).cuda()) + + output = output.cpu() + return output.cpu() + + +def test_ignore_torch_compile_decorator(): + # piecewise + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + @support_torch_compile + class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + @ignore_torch_compile + class B(A): + ... + + @support_torch_compile + class C(B): + ... + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() + + # B's ignore_torch_compile should override A's support_torch_compile + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ): + run_model(vllm_config, mod_B, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() + + # C's support_torch_compile should override B's ignore_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_C, cudagraph_runtime_mode) + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=True +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. + kv_sharing_fast_prefill) +class B(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x + x + return x + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=False +@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
+ cache_config.kv_sharing_fast_prefill) +class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.mod1(x) + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = self.mod2(x) + return x + + +def test_conditional_compile_enable_if(): + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=True, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile but enable_if fn returns False + # enable_if will be True for B, so we expect mod1 and mod2 + # to be compiled + with compilation_counter.expect( + num_graphs_seen=2, + num_piecewise_graphs_seen=6, + # 3 piecewise graphs per instance of B() + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + # Set kv_sharing_fast_prefill=False + # which will cause A to be compiled and B to not be compiled + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=False, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=7, + # 3 attn ops and 4 non-attn ops + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 58f70ef9ef..41d9fcb824 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -52,6 +52,14 @@ def _should_ignore_torch_compile(cls) -> bool: return getattr(cls, IGNORE_COMPILE_KEY, False) +@overload +def support_torch_compile( + *, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, +) -> Callable[[_T], _T]: + ... + + @overload def support_torch_compile( + *, @@ -69,6 +77,7 @@ def support_torch_compile( cls: Optional[_T] = None, *, dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -118,6 +127,11 @@ def support_torch_compile( NOTE: if an argument is `None`, it should always be passed as `None` during the lifetime of the model, otherwise, it cannot be captured as a single computation graph. + + `enable_if` is a function that takes a `VllmConfig` object as input and + returns a boolean value indicating whether to compile the model or not.
+ This is useful if you want to compile the model only when certain + conditions are met. """ def cls_decorator_helper(cls: _T) -> _T: @@ -149,7 +163,8 @@ def support_torch_compile( if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, inferred_dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims, + enable_if) if cls is not None: # use `support_torch_compile` as a decorator without arguments @@ -162,6 +177,7 @@ def support_torch_compile( def _support_torch_compile( cls: _T, dynamic_arg_dims: dict[str, Union[int, list[int]]], + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> _T: """ A decorator to add support for compiling the forward method of a class. @@ -182,13 +198,14 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config + enable_compile = enable_if is None or enable_if(vllm_config) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = \ vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() or _should_ignore_torch_compile( - self.__class__) + self.__class__) or not enable_compile if self.do_not_compile: return From c4477f55e581e5ef5f52bbe39cba6e0de1956444 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 20 Aug 2025 10:37:29 -0700 Subject: [PATCH 436/932] Migrate Mistral3ImagePixelInputs to TensorSchema (#21945) Signed-off-by: Benji Beck Co-authored-by: Cyrus Leung --- vllm/model_executor/models/mistral3.py | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index a647292d3a..438513433d 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -42,15 +43,23 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .vision import get_vision_encoder_info -class Mistral3ImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class Mistral3ImagePixelInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each image + - w: Width of each image """ - Shape: `(batch_size * num_images, num_channels, height, width)` - Note that `height` or `width` may be different per batch and image, - in which case the data is passed as a list instead of a batched tensor. 
- """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + + # Note that `height` or `width` may be different per batch and image, + # in which case the data is passed as a list instead of a batched tensor. + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}), + ] class Mistral3PatchMerger(nn.Module): @@ -456,19 +465,6 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Mistral3ImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) From f77a0802b758a32c5b9f7bc04e9498d77e8d99e0 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 13:57:37 -0400 Subject: [PATCH 437/932] Limit HTTP header count and size (#23267) Signed-off-by: Taneem Ibrahim Signed-off-by: Russell Bryant Co-authored-by: Taneem Ibrahim --- vllm/entrypoints/constants.py | 10 ++++++++++ vllm/entrypoints/launcher.py | 21 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 ++ vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 4 files changed, 41 insertions(+) create mode 100644 vllm/entrypoints/constants.py diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py new file mode 100644 index 0000000000..b5bcccc35d --- /dev/null +++ b/vllm/entrypoints/constants.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared constants for vLLM entrypoints. +""" + +# HTTP header limits for h11 parser +# These constants help mitigate header abuse attacks +H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB +H11_MAX_HEADER_COUNT_DEFAULT = 256 diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 9f4dc19fb4..4e852ba594 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -14,6 +14,8 @@ from vllm import envs from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.ssl import SSLCertRefresher from vllm.logger import init_logger from vllm.utils import find_process_using_port @@ -26,6 +28,11 @@ async def serve_http(app: FastAPI, sock: Optional[socket.socket], enable_ssl_refresh: bool = False, **uvicorn_kwargs: Any): + """ + Start a FastAPI app using Uvicorn, with support for custom Uvicorn config + options. Supports http header limits via h11_max_incomplete_event_size and + h11_max_header_count. 
+ """ logger.info("Available routes are:") for route in app.routes: methods = getattr(route, "methods", None) @@ -36,7 +43,21 @@ async def serve_http(app: FastAPI, logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + # Extract header limit options if present + h11_max_incomplete_event_size = uvicorn_kwargs.pop( + "h11_max_incomplete_event_size", None) + h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None) + + # Set safe defaults if not provided + if h11_max_incomplete_event_size is None: + h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + if h11_max_header_count is None: + h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT + config = uvicorn.Config(app, **uvicorn_kwargs) + # Set header limits + config.h11_max_incomplete_event_size = h11_max_incomplete_event_size + config.h11_max_header_count = h11_max_header_count config.load() server = uvicorn.Server(config) _add_shutdown_handlers(app, server) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 765327da3b..24148bcef2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1922,6 +1922,8 @@ async def run_server_worker(listen_address, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, **uvicorn_kwargs, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e15f65b430..6e4eff5c80 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -20,6 +20,8 @@ from vllm.config import config from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger @@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" enable_log_outputs: bool = False """If set to True, enable logging of model outputs (generations) in addition to the input logging that is enabled by default.""" + h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + """Maximum size (bytes) of an incomplete HTTP event (header or body) for + h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" + h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT + """Maximum number of HTTP headers allowed in a request for h11 parser. + Helps mitigate header abuse. 
Default: 256.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From ebe56a0064f7a72a5c51d4cd6bcca165590c5bed Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:15:18 -0400 Subject: [PATCH 438/932] Small fix for Command-A-Vision (#23268) Signed-off-by: donglu --- vllm/model_executor/models/cohere2_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 4682a8a428..fca1aee835 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -348,7 +348,7 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, vllm_config=vllm_config, hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), - architectures=["Cohere2ForCausalLM"]) + architectures=config.text_config.architectures) @property def dtype(self): From 0cdbf5e61ce3fd97d33b31b775d2faaadc99fbc5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 15:13:36 -0400 Subject: [PATCH 439/932] [Kernel/Quant] Remove the original marlin format and qqq (#23204) Signed-off-by: mgoin --- .../configs/Meta-Llama-3-8B-QQQ.yaml | 12 - .../lm-eval-harness/configs/models-large.txt | 1 - CMakeLists.txt | 2 - benchmarks/kernels/benchmark_machete.py | 23 +- csrc/quantization/machete/generate.py | 139 +- csrc/quantization/marlin/dense/LICENSE | 209 --- csrc/quantization/marlin/dense/common/base.h | 32 - csrc/quantization/marlin/dense/common/mem.h | 89 -- .../marlin/dense/marlin_cuda_kernel.cu | 1073 -------------- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 1248 ----------------- csrc/torch_bindings.cpp | 17 - tests/compile/test_full_graph.py | 6 - tests/kernels/quantization/test_machete_mm.py | 34 +- .../kernels/quantization/test_marlin_gemm.py | 83 -- tests/quantization/test_configs.py | 10 - tests/quantization/test_lm_head.py | 6 +- tests/weight_loading/models.txt | 4 - vllm/_custom_ops.py | 36 - vllm/config/__init__.py | 7 +- vllm/lora/layers.py | 3 - vllm/model_executor/layers/linear.py | 1 - .../layers/quantization/__init__.py | 6 - .../layers/quantization/marlin.py | 263 ---- .../model_executor/layers/quantization/qqq.py | 275 ---- .../utils/marlin_utils_test_qqq.py | 126 -- .../layers/quantization/utils/quant_utils.py | 85 -- 26 files changed, 92 insertions(+), 3698 deletions(-) delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml delete mode 100644 csrc/quantization/marlin/dense/LICENSE delete mode 100644 csrc/quantization/marlin/dense/common/base.h delete mode 100644 csrc/quantization/marlin/dense/common/mem.h delete mode 100644 csrc/quantization/marlin/dense/marlin_cuda_kernel.cu delete mode 100644 csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu delete mode 100644 vllm/model_executor/layers/quantization/marlin.py delete mode 100644 vllm/model_executor/layers/quantization/qqq.py delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml deleted file mode 100644 index 56ec933c9c..0000000000 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# For vllm script, with -t option (tensor parallel size). 
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 -model_name: "HandH1998/QQQ-Llama-3-8b-g128" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.419 - - name: "exact_match,flexible-extract" - value: 0.416 -limit: 1000 -num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt index 27a1a9a82b..37eeac85c9 100644 --- a/.buildkite/lm-eval-harness/configs/models-large.txt +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml Mixtral-8x7B-Instruct-v0.1.yaml Qwen2-57B-A14-Instruct.yaml DeepSeek-V2-Lite-Chat.yaml -Meta-Llama-3-8B-QQQ.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index bcbd1b52a0..a1deefb07f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 975d10f2e9..a9c4d30d9b 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 88b3f9c734..0d14ba1593 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -571,78 +571,79 @@ def generate(): itertools.repeat(default_heuristic)) ] - # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) - # TODO (LucasWilkinson): Further tuning required - qqq_tile_heuristic_config = { - #### M = 257+ - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), - # "M > 256": ((128, 256), (2, 1, 1)), - "M > 256": ((128, 128), (2, 1, 1)), - #### M = 129-256 - "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), - "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 128": ((128, 256), (2, 1, 1)), - "M > 128": ((128, 128), (2, 1, 1)), - #### M = 65-128 - "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), - "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), - "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), 
- "M > 64": ((128, 128), (2, 1, 1)), - #### M = 33-64 - "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), - # Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), - "M > 32": ((128, 64), (2, 1, 1)), - #### M = 17-32 - "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), - "M > 16": ((256, 32), (2, 1, 1)), - #### M = 1-16 - "N >= 26624": ((256, 16), (1, 1, 1)), - None: ((128, 16), (1, 1, 1)), - } + # TODO: Support W4A8 when ready + # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # # TODO (LucasWilkinson): Further tuning required + # qqq_tile_heuristic_config = { + # #### M = 257+ + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # # "M > 256": ((128, 256), (2, 1, 1)), + # "M > 256": ((128, 128), (2, 1, 1)), + # #### M = 129-256 + # "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + # "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 128": ((128, 256), (2, 1, 1)), + # "M > 128": ((128, 128), (2, 1, 1)), + # #### M = 65-128 + # "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + # "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + # "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + # "M > 64": ((128, 128), (2, 1, 1)), + # #### M = 33-64 + # "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # # Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + # "M > 32": ((128, 64), (2, 1, 1)), + # #### M = 17-32 + # "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + # "M > 16": ((256, 32), (2, 1, 1)), + # #### M = 1-16 + # "N >= 26624": ((256, 16), (1, 1, 1)), + # None: ((128, 16), (1, 1, 1)), + # } - # For now we use the same heuristic for all types - # Heuristic is currently tuned for H100s - qqq_heuristic = [ - (cond, ScheduleConfig(*tile_config, - **sch_common_params)) # type: ignore - for cond, tile_config in qqq_tile_heuristic_config.items() - ] + # # For now we use the same heuristic for all types + # # Heuristic is currently tuned for H100s + # qqq_heuristic = [ + # (cond, ScheduleConfig(*tile_config, + # **sch_common_params)) # type: ignore + # for cond, tile_config in qqq_tile_heuristic_config.items() + # ] - QQQ_kernel_types = [ - *(TypeConfig( - a=DataType.s8, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.s32, - ) for b_group_scale in (DataType.f16, DataType.void)), - *(TypeConfig( - a=DataType.e4m3, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.f32, - ) for b_group_scale in (DataType.f16, DataType.void)), - ] + # QQQ_kernel_types = [ + # *(TypeConfig( + # a=DataType.s8, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.s32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # *(TypeConfig( + # a=DataType.e4m3, + # 
b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.f32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # ] - impl_configs += [ - ImplConfig(x[0], x[1], x[2]) - for x in zip(QQQ_kernel_types, - itertools.repeat(get_unique_schedules(qqq_heuristic)), - itertools.repeat(qqq_heuristic)) - ] + # impl_configs += [ + # ImplConfig(x[0], x[1], x[2]) + # for x in zip(QQQ_kernel_types, + # itertools.repeat(get_unique_schedules(qqq_heuristic)), + # itertools.repeat(qqq_heuristic)) + # ] output_dir = os.path.join(SCRIPT_DIR, "generated") diff --git a/csrc/quantization/marlin/dense/LICENSE b/csrc/quantization/marlin/dense/LICENSE deleted file mode 100644 index 1d1e4cf9c8..0000000000 --- a/csrc/quantization/marlin/dense/LICENSE +++ /dev/null @@ -1,209 +0,0 @@ -Contains code from https://github.com/IST-DASLab/marlin - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. diff --git a/csrc/quantization/marlin/dense/common/base.h b/csrc/quantization/marlin/dense/common/base.h deleted file mode 100644 index 68c83d5478..0000000000 --- a/csrc/quantization/marlin/dense/common/base.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } - -// Instances of `Vec` are used to organize groups of >>registers<<, as needed -// for instance as inputs to tensor core operations. Consequently, all -// corresponding index accesses must be compile-time constants, which is why we -// extensively use `#pragma unroll` throughout the kernel code to guarantee -// this. -template -struct Vec { - T elems[n]; - __device__ T& operator[](int i) { return elems[i]; } -}; diff --git a/csrc/quantization/marlin/dense/common/mem.h b/csrc/quantization/marlin/dense/common/mem.h deleted file mode 100644 index 64f9c393d7..0000000000 --- a/csrc/quantization/marlin/dense/common/mem.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Predicated asynchronous global->shared copy; used for inputs A where we apply -// predication to handle batchsizes that are not multiples of 16. -__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, - bool pred = true) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); -} - -// Asynchronous global->shared copy -__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// Async copy fence. -__device__ inline void cp_async_fence() { - asm volatile("cp.async.commit_group;\n" ::); -} - -// Wait until at most `n` async copy stages are still pending. -template -__device__ inline void cp_async_wait() { - asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); -} - -// Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int* lock, int count) { - if (threadIdx.x == 0) { - int state = -1; - do - // Guarantee that subsequent writes by this threadblock will be visible - // globally. - asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" - : "=r"(state) - : "l"(lock)); - while (state != count); - } - __syncthreads(); -} - -// Release barrier and increment visitation count. -__device__ inline void barrier_release(int* lock, bool reset = false) { - __syncthreads(); - if (threadIdx.x == 0) { - if (reset) { - lock[0] = 0; - return; - } - int val = 1; - // Make sure that all writes since acquiring this barrier are visible - // globally, while releasing the barrier. - asm volatile("fence.acq_rel.gpu;\n"); - asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" - : - : "l"(lock), "r"(val)); - } -} diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu deleted file mode 100644 index ea96326ed7..0000000000 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace marlin_dense { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS = Vec; // quantization scales - -// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 -// output/accumulation. -__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) - : "r"(smem)); -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. -template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 -// values. We mostly follow the strategy in the link below, with some small -// changes: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -__device__ inline FragB dequant(int q) { - const int LO = 0x000f000f; - const int HI = 0x00f000f0; - const int EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - const int SUB = 0x64086408; - const int MUL = 0x2c002c00; - const int ADD = 0xd480d480; - FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - return frag_b; -} - -// Multiply dequantized values by the corresponding quantization scale; used -// only for grouped quantization. 
-__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); - frag_b[0] = __hmul2(frag_b[0], s); - frag_b[1] = __hmul2(frag_b[1], s); -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 8; - C += 16 * thread_m_blocks * prob_n / 8; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 8; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 2 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - int s_gl_stride = prob_n / 8; - constexpr int s_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_sh_stage = s_sh_stride; - int s_gl_rd_delta = s_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. 
- int a_sh_rd = - a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; - a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_sh_stride * slice_col + threadIdx.x; - auto s_sh_wr = threadIdx.x; - int s_sh_rd; - // We use a different scale layout for grouped and column-wise quantization as - // we scale a `half2` tile in column-major layout in the former and in - // row-major in the latter case. - if (group_blocks != -1) - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - else - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) % 4; - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - bool s_sh_wr_pred = threadIdx.x < s_sh_stride; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. - int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s = sh_b + (stages * b_sh_stage); - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; - - // Zero accumulators. 
- auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); - s_gl_rd += s_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. - auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; - FragB frag_b0 = dequant(b_quant); - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. 
- if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); - FragB frag_b1 = dequant(b_quant_shift); - if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. - - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 8; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 4 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 4 * (threadIdx.x / 32) + threadIdx.x % 4; - c_gl_wr += (2 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads; - auto c_sh_wr = threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. 
- #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half*>(&c_red)[j]); - } - } - if (!last) { - int4 c; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half*>(&c)[j] = - __float2half(reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - c; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. - auto write_result = [&]() { - int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks + 1; - int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int c_sh_rd_delta = - c_sh_stride * (threads / (2 * thread_n_blocks)); - - int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - c_gl_wr += (2 * thread_n_blocks) * slice_col; - int c_sh_wr = - (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - c_sh_wr += 32 * (threadIdx.x / 32); - int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int c_gl_wr_end = c_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { - half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - if (group_blocks == - -1) // for per-column quantization we finally apply the scale here - res = __hmul2(res, s[0]); - ((half2*)sh)[idx] = res; - }; - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = c_sh_wr + 8 * j; - write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); - } - c_sh_wr += 16 * (4 * c_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; - c_gl_wr += c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. 
- auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. - if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (group_blocks == -1 && last) { - if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); - cp_async_fence(); - } - thread_block_reduce(); - if (group_blocks == -1 && last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory -const int SHARED_MEM = - 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - SHARED_MEM); \ - Marlin<<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_cuda(const void* A, const void* B, void* C, void* 
s, int prob_m, - int prob_n, int prob_k, void* workspace, int groupsize = -1, - int dev = 0, cudaStream_t stream = 0, int thread_k = -1, - int thread_n = -1, int sms = -1, int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - // Uncomment for debug - // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + - // ", thread_n = " + str(th_config.thread_n) + - // ", num_threads = " + str(th_config.num_threads) + " for - // MKN = [" + str(prob_m) + - // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - const int4* s_ptr = (const int4*)s; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; - C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - } -} - -} // namespace marlin_dense - -torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, torch::Tensor& workspace, - int64_t size_m, int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % marlin_dense::tile_size == 0, - "size_k = " + str(size_k) + " is not divisible by tile_size = " + - str(marlin_dense::tile_size)); - TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + - str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + - ", tile_size = " + str(marlin_dense::tile_size)); - - // Verify N - TORCH_CHECK(b_scales.size(1) == size_n, - "b_scales.size(1) = " + str(b_scales.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - b_q_weight.size(1) % marlin_dense::tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(marlin_dense::tile_size)); - - int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) * - marlin_dense::pack_factor_4bit; - TORCH_CHECK( - size_n == actual_size_n, - "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify scales device and strides - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - torch::Tensor c = torch::empty({size_m, size_n}, options); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - // Detect groupsize - if (b_scales.size(0) != 1) { - TORCH_CHECK(size_k % b_scales.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by b_scales.size(0) = " + - str(b_scales.size(0))); - } - int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); - - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify workspace size - TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + - str(marlin_dense::min_thread_n)); - int min_workspace_size = - (size_n / marlin_dense::min_thread_n) * marlin_dense::max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - int dev = a.get_device(); - marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, marlin_dense::max_par); - - return c; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_gemm", &marlin_gemm); -} diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu deleted file mode 100644 index c96d68d9b2..0000000000 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * Adapted from - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda_kernel.cu - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda.cpp - * Modified by HandH1998 - * Copyright (C) 2024 HandH1998 - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <iostream>
-
-#include "../dense/common/base.h"
-#include "core/registration.h"
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-  #include "../dense/common/mem.h"
-#endif
-
-template <typename T>
-inline std::string str(T x) {
-  return std::to_string(x);
-}
-
-namespace {
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-
-using I4 = Vec<int, 4>;
-// Matrix fragments for tensor core instructions; their precise layout is
-// documented here:
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-integer-type
-using FragA = Vec<uint32_t, 2>;
-using FragB = Vec<uint32_t, 1>;
-using FragC = Vec<int, 4>;
-using FragS_GROUP = Vec<half2, 1>;  // weight per-group quantization scales
-using FragS_CHANNEL =
-    Vec<float, 2>;  // weight per-channel quantization scales or activation
-                    // per-token quantization scales
-
-// NOTE(HandH1998): cp.async.cg only supports BYTES = 16, however,
-// cp.async.ca can support BYTES = 4, 8, 16;
-// as s_tok's shape is equal to prob_m, we need to set s_tok to float type,
-// and cp_size = 1 float, i.e., 4 BYTES
-// Asynchronous global->shared copy for activation quantization scales s_tok
-__device__ inline void cp_async1(void* smem_ptr, const void* glob_ptr) {
-  const int BYTES = 4;
-  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile(
-      "{\n"
-      "   cp.async.ca.shared.global [%0], [%1], %2;\n"
-      "}\n" ::"r"(smem),
-      "l"(glob_ptr), "n"(BYTES));
-}
-
-// m16n8k16 tensor core mma instruction with int8 inputs and int32
-// output/accumulation.
-__device__ inline void mma(const FragA& a_frag, const FragB& frag_b,
-                           FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  int* c = reinterpret_cast<int*>(&frag_c);
-  asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 "
-      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-      : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-        "r"(c[3]));
-}
-
-// Instruction for loading a full 16x16 matrix fragment of operand A from shared
-// memory, directly in int8 tensor core layout.
-__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
-  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
-  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
-               : "=r"(a[0]), "=r"(a[1])
-               : "r"(smem));
-}
-
-inline __device__ half2 float2_to_half2(float2 f) {
-  uint32_t res;
-  // NOTE(HandH1998): h0,h1 should be uint16_t, not half
-  uint16_t h0, h1;
-  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h0) : "f"(f.x));
-  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h1) : "f"(f.y));
-  asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(res) : "h"(h0), "h"(h1));
-  return reinterpret_cast<half2&>(res);
-}
-
-inline __device__ float int32_to_float(int h) {
-  float res;
-  asm volatile("cvt.rn.f32.s32 %0, %1;\n" : "=f"(res) : "r"(h));
-  return res;
-}
-
-// Lookup-table based 3-input logical operation; explicitly used for
-// dequantization as the compiler does not seem to automatically recognize it in
-// all cases.
-template <int lut>
-__device__ inline int lop3(int a, int b, int c) {
-  int res;
-  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-               : "=r"(res)
-               : "r"(a), "r"(b), "r"(c), "n"(lut));
-  return res;
-}
-
-// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
-// for weight per channel dequant.
-__device__ inline FragB dequant_per_channel(int q) {
-  static constexpr int MASK = 0xf0f0f0f0;
-  FragB frag_b;
-  frag_b[0] = (q & MASK);
-  return frag_b;
-}
-
-// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
-// for weight per group dequant.
-__device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
-  static constexpr uint32_t LO = 0x000f000f;
-  static constexpr uint32_t HI = 0x00f000f0;
-  static constexpr uint32_t EX = 0x64006400;
-  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
-  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
-  // directly into `SUB` and `ADD`.
-  static constexpr uint32_t SUB = 0x64086408;
-  static constexpr uint32_t MUL = 0x2c002c00;
-  static constexpr uint32_t ADD = 0xd480d480;
-  *reinterpret_cast<half2*>(&t0) = __hsub2(
-      *reinterpret_cast<half2*>(&t0), *reinterpret_cast<const half2*>(&SUB));
-  *reinterpret_cast<half2*>(&t1) = __hfma2(
-      *reinterpret_cast<half2*>(&t1), *reinterpret_cast<const half2*>(&MUL),
-      *reinterpret_cast<const half2*>(&ADD));
-
-  uint16_t s = reinterpret_cast<uint16_t*>(&frag_s)[i];
-  uint32_t double_s;
-  // pack 2xfp16 to half2
-  asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(double_s) : "h"(s), "h"(s));
-  // dequant and convert 4 half to 4 uint8 (be placed at the low 8 bits of 4
-  // half, respectively)
-  static constexpr uint32_t MAGIC_NUM = 0x64806480;
-  *reinterpret_cast<half2*>(&t0) = __hfma2(
-      *reinterpret_cast<half2*>(&t0), *reinterpret_cast<half2*>(&double_s),
-      *reinterpret_cast<const half2*>(&MAGIC_NUM));
-  *reinterpret_cast<half2*>(&t1) = __hfma2(
-      *reinterpret_cast<half2*>(&t1), *reinterpret_cast<half2*>(&double_s),
-      *reinterpret_cast<const half2*>(&MAGIC_NUM));
-  // take out the 4 uint8 from 4 half, then convert them to 4 int8 and pack 4
-  // int8 into 1 uint32
-  FragB frag_b;
-  uint32_t uint8s;
-  static constexpr uint32_t MASK_0246 = 0x6420;
-  static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080;
-  asm volatile("prmt.b32 %0,%1,%2,%3;\n"
-               : "=r"(uint8s)
-               : "r"(t0), "r"(t1), "n"(MASK_0246));
-  frag_b[0] = (uint8s ^ UINT8s_TO_INT8s_MASK);
-  return frag_b;
-}
-
-template <const int threads,          // number of threads in a threadblock
-          const int thread_m_blocks,  // number of 16x16 blocks in the m
-                                      // dimension (batchsize) of the
-                                      // threadblock
-          const int thread_n_blocks,  // same for n dimension (output)
-          const int thread_k_blocks,  // same for k dimension (reduction)
-          const int stages,  // number of stages for the async global->shared
-                             // fetch pipeline
-          const int group_blocks = -1  // number of consecutive 16x16 blocks
-                                       // with a separate quantization scale
-          >
-__global__ void Marlin(
-    const int4* __restrict__ A,  // int8 input matrix of shape mxk
-    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
-    int4* __restrict__ C,  // int32 global_reduce buffer of shape
-                           // (max_par*16*4)xn, as int8 tensor core's output is
-                           // int32 dtype
-    int4* __restrict__ D,  // fp16 output buffer of shape mxn
-    const float* __restrict__ s_tok,  // fp32 activation per-token quantization
-                                      // scales of shape mx1
-    const int4* __restrict__ s_ch,    // fp32 weight per-channel quantization
-                                      // scales of shape 1xn
-    const int4* __restrict__ s_group,  // fp16 weight per-group quantization
-                                       // scales of shape (k/groupsize)xn, when
-                                       // group_blocks=-1, it should be nullptr
-    int prob_m,  // batch dimension m
-    int prob_n,  // output dimension n
-    int prob_k,  // reduction dimension k
-    int* locks   // extra global storage for barrier synchronization
-) {
-  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
-  // same size, which might involve multiple column "slices" (of width 16 *
-  // `thread_n_blocks`).
Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if constexpr (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 16; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 4; - D += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - s_tok += (slice_col_par / n_tiles) * 16 * thread_m_blocks; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 16; - C += 16 * thread_m_blocks * prob_n / 4; - D += 16 * thread_m_blocks * prob_n / 8; - s_tok += 16 * thread_m_blocks; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 16; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 16; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 16; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 1 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - constexpr int s_tok_sh_stride = 16 * thread_m_blocks; - - constexpr int s_ch_sh_stride = 16 * thread_n_blocks / 4; - - int s_group_gl_stride = prob_n / 8; - constexpr int s_group_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_group_sh_stage = s_group_sh_stride; - int s_group_gl_rd_delta = s_group_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. 
- // NOTE(HandH1998): int8 input a only need 16 threads to load 16x16 matrix - int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % 16); - a_sh_rd += 1 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - auto s_tok_gl_rd = threadIdx.x; - // NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10, - // 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for - // thread 0, 1, 2, 3. For more details, refer to mma operand A layout as - // s_tok's size is not fixed, we can not shuffle before inference we shuffle - // it when fetching s_tok from global memory to shared memory, that's why - // s_tok_sh_wr is like this - int s_tok_sh_wr = - (threadIdx.x / 16) * 16 + (threadIdx.x % 8) * 2 + (threadIdx.x % 16) / 8; - int s_tok_sh_rd = (threadIdx.x % 32) / 4; - bool s_tok_sh_wr_pred = threadIdx.x < prob_m; - - auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - auto s_ch_sh_wr = threadIdx.x; - int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - 2 * ((threadIdx.x % 32) % 4); - bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride; - - int s_group_gl_rd, s_group_sh_wr, s_group_sh_rd; - bool s_group_sh_wr_pred; - if constexpr (group_blocks != -1) { - s_group_gl_rd = - s_group_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_group_sh_stride * slice_col + threadIdx.x; - s_group_sh_wr = threadIdx.x; - // NOTE(HandH1998): s_group_sh_rd is related to mma output C - s_group_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - s_group_sh_wr_pred = threadIdx.x < s_group_sh_stride; - } - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. 
- int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - // NOTE(HandH1998): stages need >= 4, otherwise, sh_s_tok = sh + max(stages * - // a_sh_stage + stages * b_sh_stage, 4 * stages * a_sh_stage) - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s_tok = sh_b + (stages * b_sh_stage); - int4* sh_s_ch = sh_s_tok + s_tok_sh_stride; - int4* sh_s_group = sh_s_ch + s_ch_sh_stride; - - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS_GROUP frag_s_group[2][4]; - FragS_CHANNEL frag_s_tok[thread_m_blocks]; - FragS_CHANNEL frag_s_ch[2][4]; - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_group_stage = sh_s_group + s_group_sh_stage * pipe; - if (s_group_sh_wr_pred) - cp_async4(&sh_s_group_stage[s_group_sh_wr], - &s_group[s_group_gl_rd]); - s_group_gl_rd += s_group_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
- auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - int4* sh_s_group_stage = - sh_s_group + - s_group_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s_group[k % 2])[0] = - sh_s_group_stage[s_group_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - // int b_quant_shift = b_quant << 4; - FragB frag_b0, frag_b1; - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. - if constexpr (group_blocks != -1) { - int b_quant_shift = b_quant >> 8; - frag_b0 = dequant_per_group(b_quant, frag_s_group[k % 2][j], 0); - frag_b1 = dequant_per_group(b_quant_shift, frag_s_group[k % 2][j], 1); - } else { - int b_quant_shift = b_quant << 4; - frag_b0 = dequant_per_channel(b_quant); - frag_b1 = dequant_per_channel(b_quant_shift); - } - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. 
- - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - int* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - // global_reduce works on INT32 elements, which are the results of INT8 GEMM. - // This is why we need another INT32 maxtrix `C` to reduce instead of the - // original half matrix `D`. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 4; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 8 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2; - c_gl_wr += (4 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads * 2; - auto c_sh_wr = 2 * threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. 
- #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i + 1], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2) + 1], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 d_red1 = sh[c_sh_wr + i * c_sh_wr_delta]; - int4 d_red2 = sh[c_sh_wr + i * c_sh_wr_delta + 1]; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - reinterpret_cast(&d_red1)[j]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)] += - reinterpret_cast(&d_red2)[j]; - } - } - if (!last) { - int4 d1, d2; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d1)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d2)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)]; - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - d1; - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2) + - 1] = d2; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
- auto write_result = [&]() { - int d_gl_stride = prob_n / 8; - constexpr int d_sh_stride = 2 * thread_n_blocks + 1; - int d_gl_wr_delta = d_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int d_sh_rd_delta = - d_sh_stride * (threads / (2 * thread_n_blocks)); - - int d_gl_wr = d_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - d_gl_wr += (2 * thread_n_blocks) * slice_col; - int d_sh_wr = - (4 * d_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - d_sh_wr += 32 * (threadIdx.x / 32); - int d_sh_rd = d_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int d_gl_wr_end = d_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, int c0, int c1, float a_s, FragS_CHANNEL& w_s) { - float2 deq_res; - deq_res.x = int32_to_float(c0) * w_s[0] * a_s; - deq_res.y = int32_to_float(c1) * w_s[1] * a_s; - ((half2*)sh)[idx] = float2_to_half2(deq_res); - }; - - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = d_sh_wr + 8 * j; - write(wr + (4 * d_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * d_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - } - d_sh_wr += 16 * (4 * d_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (d_gl_wr < d_gl_wr_end) { - D[d_gl_wr] = sh[d_sh_rd]; - d_gl_wr += d_gl_wr_delta; - d_sh_rd += d_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (last) { - if (s_tok_sh_wr_pred) { - cp_async1(&sh_s_tok[s_tok_sh_wr], &s_tok[s_tok_gl_rd]); - } - if (s_ch_sh_wr_pred) { - cp_async4(&sh_s_ch[s_ch_sh_wr], &s_ch[s_ch_gl_rd]); - } - cp_async_fence(); - } - thread_block_reduce(); - if (last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - frag_s_tok[i][0] = - *reinterpret_cast(&sh_s_tok[16 * i + 2 * s_tok_sh_rd]); - frag_s_tok[i][1] = *reinterpret_cast( - &sh_s_tok[16 * i + 2 * s_tok_sh_rd + 1]); - } - reinterpret_cast(&frag_s_ch)[0] = sh_s_ch[s_ch_sh_rd + 0]; - reinterpret_cast(&frag_s_ch)[1] = sh_s_ch[s_ch_sh_rd + 1]; - reinterpret_cast(&frag_s_ch)[2] = sh_s_ch[s_ch_sh_rd + 8]; - reinterpret_cast(&frag_s_ch)[3] = sh_s_ch[s_ch_sh_rd + 9]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_group_gl_rd = s_group_sh_stride * slice_col + threadIdx.x; - s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, D_ptr, s_tok_ptr, s_ch_ptr, s_group_ptr, \ - prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_qqq_cuda(const void* A, const void* B, void* C, void* D, - void* s_tok, void* s_ch, void* 
s_group, int prob_m, - int prob_n, int prob_k, void* workspace, - int groupsize = -1, int dev = 0, cudaStream_t stream = 0, - int thread_k = -1, int thread_n = -1, int sms = -1, - int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - int max_shared_mem = 0; - cudaDeviceGetAttribute(&max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - TORCH_CHECK(max_shared_mem > 0); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - int4* D_ptr = (int4*)D; - const float* s_tok_ptr = (const float*)s_tok; - const int4* s_ch_ptr = (const int4*)s_ch; - const int4* s_group_ptr = (const int4*)s_group; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 16) * par; - D_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - s_tok_ptr += 16 * thread_m_blocks * par; - } -} -} // anonymous namespace - -torch::Tensor marlin_qqq_gemm(torch::Tensor const& a, - torch::Tensor const& b_q_weight, - torch::Tensor const& s_tok, - torch::Tensor const& s_ch, - torch::Tensor const& s_group, - torch::Tensor& workspace, int64_t size_m, - int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - TORCH_CHECK(size_m == s_tok.numel(), - "Shape mismatch: s_tok.numel() = " + str(s_tok.numel()) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % tile_size == 0, - "size_k = " + str(size_k) + - " is not divisible by tile_size = " + str(tile_size)); - TORCH_CHECK( - (size_k / tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + str(b_q_weight.size(0)) + - ", size_k = " + str(size_k) + ", tile_size = " + str(tile_size)); - - int groupsize = (s_group.numel() == 0) ? -1 : size_k / s_group.size(0); - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify N - TORCH_CHECK(s_ch.numel() == size_n, - "Shape mismatch: s_ch.numel() = " + str(s_ch.numel()) + - ", size_n = " + str(size_n)); - TORCH_CHECK(b_q_weight.size(1) % tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(tile_size)); - if (groupsize != -1) { - TORCH_CHECK(s_group.size(1) == size_n, - "Shape mismatch: s_group.size(1) = " + str(s_group.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - size_k % s_group.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by s_group.size(0) = " + str(s_group.size(0))); - } - - int actual_size_n = (b_q_weight.size(1) / tile_size) * pack_factor_4bit; - TORCH_CHECK(size_n == actual_size_n, - "Shape mismatch: size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify s_tok device, strides and dtype - TORCH_CHECK(s_tok.device().is_cuda(), "s_tok is not on GPU"); - TORCH_CHECK(s_tok.is_contiguous(), "s_tok is not contiguous"); - TORCH_CHECK(s_tok.dtype() == torch::kFloat32, "s_tok's dtype is not float32"); - - // Verify s_ch device, strides and dtype - TORCH_CHECK(s_ch.device().is_cuda(), "s_ch is not on GPU"); - TORCH_CHECK(s_ch.is_contiguous(), "s_ch is not contiguous"); - TORCH_CHECK(s_ch.dtype() == torch::kFloat32, "s_ch's dtype is not float32"); - - // Verify s_group device, strides and dtype - TORCH_CHECK(s_group.device().is_cuda(), 
"s_group is not on GPU"); - TORCH_CHECK(s_group.is_contiguous(), "s_group is not contiguous"); - TORCH_CHECK(s_group.dtype() == torch::kFloat16, - "s_group's dtype is not float16"); - - // Verify workspace size - TORCH_CHECK(size_n % min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + str(min_thread_n)); - int min_workspace_size = (size_n / min_thread_n) * max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options_c = torch::TensorOptions().dtype(torch::kInt).device(a.device()); - torch::Tensor c = torch::empty({max_par * 64, size_n}, options_c); - - // Alloc D matrix - auto options_d = - torch::TensorOptions().dtype(torch::kFloat16).device(a.device()); - torch::Tensor d = torch::empty({size_m, size_n}, options_d); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - int dev = a.get_device(); - marlin_qqq_cuda( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), d.data_ptr(), - s_tok.data_ptr(), s_ch.data_ptr(), s_group.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par); - - return d; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_qqq_gemm", &marlin_qqq_gemm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3a0ff6eaa7..60710f62c0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -241,14 +241,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // custom types: // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA - // Marlin (Dense) Optimized Quantized GEMM for GPTQ. - ops.def( - "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> " - "Tensor", - {stride_tag}); - // conditionally compiled so impl in source file - // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " @@ -353,15 +345,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); #ifndef USE_ROCM - // marlin_qqq_gemm for QQQ. - ops.def( - "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " - "Tensor s_tok, Tensor s_ch, Tensor s_group, " - "Tensor! workspace, SymInt size_m, SymInt size_n, " - "SymInt size_k) -> Tensor", - {stride_tag}); - // conditionally compiled so impl registration is in source file - // CUTLASS nvfp4 block scaled GEMM ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index a2fc6ffeb8..84178344a5 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): "quantization": "gptq_marlin_24" })) - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a842d2f1cb..0e09661c95 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -95,23 +95,23 @@ TEST_TYPES = [ token_scale_type=None) for w_type in [scalar_types.uint4, scalar_types.uint8] for a_type in [torch.float16, torch.bfloat16]), - # QQQ style - *(TypeConfig(act_type=torch.int8, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), - *(TypeConfig(act_type=torch.float8_e4m3fn, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), + # # QQQ style + # *(TypeConfig(act_type=torch.int8, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), + # *(TypeConfig(act_type=torch.float8_e4m3fn, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), ] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index cea7700ac3..ad077e0b94 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -13,11 +13,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, query_marlin_supported_quant_types) @@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_weights) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( marlin_24_quantize) -from 
vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501 - marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) from vllm.scalar_type import scalar_types @@ -449,68 +443,6 @@ def test_hqq_marlin_gemm( assert max_diff < 0.04 -@pytest.mark.skipif(not is_quant_method_supported("qqq"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) -@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS) -@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES) -@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_marlin_qqq_gemm( - k_chunk, - n_chunk, - num_bits, - group_size, - mnk_factors, -): - int8_traits = torch.iinfo(torch.int8) - m_factor, n_factor, k_factor = mnk_factors - - size_m = m_factor - size_k = k_chunk * k_factor - size_n = n_chunk * n_factor - - a_input = rand_data((size_m, size_k)) - b_weight = rand_data((size_k, size_n)) - - # Quantize activations - s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to( - torch.float) - q_a = (a_input / s_a).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) - - # Quantize weights - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \ - marlin_qqq_quantize(b_weight, num_bits, group_size) - - workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_MAX_PARALLEL) - - opcheck(torch.ops._C.marlin_qqq_gemm, - (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel, - marlin_qqq_s_group, workspace.scratch, a_input.shape[0], - b_weight.shape[1], a_input.shape[1])) - - output = ops.marlin_qqq_gemm( - q_a, - marlin_qqq_q_w, - s_a, - marlin_qqq_s_channel, - marlin_qqq_s_group, - workspace.scratch, - a_input.shape[0], - b_weight.shape[1], - a_input.shape[1], - ) - output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref) - - torch.cuda.synchronize() - - max_diff = compute_max_diff(output, output_ref) - - assert max_diff < 0.04 - - def test_marlin_gemm_subset_input(): quant_type = scalar_types.uint4b8 group_size = 128 @@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m): max_diff = compute_max_diff(output, output_ref) assert max_diff < 0.04 - - -def test_marlin_gemm_opcheck(): - size_m = 2048 - size_n = 4096 - size_k = 4096 - a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16) - w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32) - s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16) - wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL).scratch - x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - torch.testing.assert_close(x, y) - opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k)) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 8cf8402436..1843bffd21 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -22,22 +22,12 @@ class ModelPair: MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool - # Model Serialized in Marlin Format should always use Marlin kernel. 
- ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"), # Model Serialized in Exllama Format. ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str - # Model Serialized in Marlin Format should always use Marlin kernel. - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"), # Model Serialized in Exllama Format. ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 11f78a23bb..5ec8b27c15 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -11,7 +11,6 @@ import torch from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod from vllm.model_executor.layers.vocab_parallel_embedding import ( UnquantizedEmbeddingMethod) @@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) ] @@ -41,8 +38,7 @@ def test_lm_head( lm_head_layer = model.lm_head if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, - MarlinLinearMethod)) + (GPTQLinearMethod, GPTQMarlinLinearMethod)) else: assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b79707409..cc18c9ff1f 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main -marlin, nm-testing/zephyr-beta-7b-marlin-g128, main -marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main -qqq, HandH1998/QQQ-Llama-3-8b-g128, main -qqq, HandH1998/QQQ-Llama-3-8b, main hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 39da08847b..59f2d7737f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -387,14 +387,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) -# marlin -def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, - size_n: int, size_k: 
int) -> torch.Tensor: - return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, - size_n, size_k) - - # marlin_24 def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, @@ -437,25 +429,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - @register_fake("_C::marlin_qqq_gemm") - def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - - @register_fake("_C::marlin_gemm") - def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - @register_fake("_C::awq_dequantize") def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: torch.SymInt, @@ -1348,15 +1321,6 @@ def scaled_int8_quant( return output, input_scales, input_azp -# qqq ops -def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - # gguf def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int, dtype: Optional[torch.dtype]) -> torch.Tensor: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 5b5d477ef0..62dfd4333b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1112,9 +1112,9 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS optimized_quantization_methods = [ - "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", - "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", - "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" + "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", + "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark", + "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, @@ -1137,7 +1137,6 @@ class ModelConfig: # `override_quantization_method` method) must be checked in order # of preference (this is particularly important for GPTQ). 
overrides = [ - "marlin", "bitblas", "gptq_marlin_24", "gptq_marlin", diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index de5933d6d4..24a05d310d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -48,9 +48,6 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # GPTQ/AWQ elif hasattr(base_layer, "qweight"): return base_layer.qweight.device - # marlin - elif hasattr(base_layer, "B"): - return base_layer.B.device # HQQ marlin elif hasattr(base_layer, "W_q"): return base_layer.W_q.device diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d3b6b2089f..654e2ec7b2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -42,7 +42,6 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index a4c2671225..ea51468422 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -15,7 +15,6 @@ QuantizationMethods = Literal[ "fbgemm_fp8", "modelopt", "modelopt_fp4", - "marlin", "bitblas", "gguf", "gptq_marlin_24", @@ -25,7 +24,6 @@ QuantizationMethods = Literal[ "gptq", "compressed-tensors", "bitsandbytes", - "qqq", "hqq", "experts_int8", "neuron_quant", @@ -106,13 +104,11 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .hqq_marlin import HQQMarlinConfig from .inc import INCConfig from .ipex_quant import IPEXConfig - from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config - from .qqq import QQQConfig from .rtn import RTNConfig from .torchao import TorchAOConfig from .tpu_int8 import Int8TpuConfig @@ -125,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "fbgemm_fp8": FBGEMMFp8Config, "modelopt": ModelOptFp8Config, "modelopt_fp4": ModelOptNvFp4Config, - "marlin": MarlinConfig, "bitblas": BitBLASConfig, "gguf": GGUFConfig, "gptq_marlin_24": GPTQMarlin24Config, @@ -136,7 +131,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "ptpc_fp8": PTPCFp8Config, - "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py deleted file mode 100644 index 18d1c13373..0000000000 --- a/vllm/model_executor/layers/quantization/marlin.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from 
vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - - -class MarlinConfig(QuantizationConfig): - """Config class for Marlin. - - Reference: https://github.com/IST-DASLab/marlin/tree/master - """ - - def __init__( - self, - group_size: int, - lm_head_quantized: bool, - ) -> None: - super().__init__() - - # Group size for the quantization. - self.group_size = group_size - self.lm_head_quantized = lm_head_quantized - if self.group_size != 128 and self.group_size != -1: - raise ValueError( - "Currently, only group size 128 and -1 (channelwise) " - "is supported for Marlin, but got group_size of " - f"{self.group_size}") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // 4 - - # Tile size used by marlin kernels. - self.tile_size = 16 - - # Min out_features dim - self.min_n_threads = 64 - - # Min in_features dim - self.min_k_threads = 128 - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = 16 - - # Permutation length used by the marlin kernels. - self.perm_len = 1024 - - def __repr__(self) -> str: - return (f"MarlinConfig(group_size={self.group_size}, " - f"lm_head_quantized={self.lm_head_quantized})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "marlin" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - # Need to figure it out - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": - group_size = cls.get_from_keys(config, ["group_size"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], - default=False) - return cls(group_size, lm_head_quantized) - - @classmethod - def override_quantization_method( - cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: - # compat: autogptq >=0.8.0 use checkpoint_format: str - # compat: autogptq <=0.7.1 is_marlin_format: bool - is_marlin_format = (hf_quant_cfg.get("checkpoint_format") == "marlin" - or hf_quant_cfg.get("is_marlin_format", False)) - - is_valid_user_quant = (user_quant is None or user_quant == "gptq" - or user_quant == "marlin") - - if is_marlin_format and is_valid_user_quant: - msg = ("The model is serialized in {} format. Using {} kernel.". - format(cls.get_name(), cls.get_name())) - logger.info(msg) - return cls.get_name() - - return None - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["MarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return MarlinLinearMethod(self) - return None - - -class MarlinLinearMethod(LinearMethodBase): - """Linear method for Marlin. - - Args: - quant_config: The Marlin quantization config. - """ - - def __init__(self, quant_config: MarlinConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del output_size # Unused. 
- weight_loader = extra_weight_attrs["weight_loader"] - - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - # Determine if channelwise or not - input_groups = (1 if self.quant_config.group_size == -1 else - input_size_per_partition // - self.quant_config.group_size) - - weight_scale_args = { - "data": - torch.empty( - input_groups, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - "weight_loader": - weight_loader - } - if input_groups == 1: - scales = ChannelQuantScaleParameter(output_dim=1, - **weight_scale_args) - else: - scales = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **weight_scale_args) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s", scales) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s = Parameter(layer.s.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - scales = layer.s - workspace = layer.workspace 
- - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = scales.shape[1] - - output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, - size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py deleted file mode 100644 index 25978cb13b..0000000000 --- a/vllm/model_executor/layers/quantization/qqq.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - -MARLIN_QQQ_TILE = 16 -MARLIN_QQQ_MIN_THREAD_N = 64 -MARLIN_QQQ_MIN_THREAD_K = 128 -MARLIN_QQQ_MAX_PARALLEL = 16 - -MARLIN_QQQ_SUPPORTED_NUM_BITS = [4] -MARLIN_QQQ_SUPPORTED_GROUP_SIZES = [-1, 128] -MARLIN_QQQ_SUPPORTED_SYM = [True] - - -class QQQConfig(QuantizationConfig): - """Config class for QQQ - - Reference: https://arxiv.org/pdf/2406.09904 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - is_sym: bool = True, - ) -> None: - super().__init__() - self.weight_bits = weight_bits - self.group_size = group_size - self.is_sym = is_sym - - # Verify - if self.weight_bits not in MARLIN_QQQ_SUPPORTED_NUM_BITS: - raise ValueError( - f"QQQ does not support weight_bits = {self.weight_bits}. " - f"Only weight_bits = {MARLIN_QQQ_SUPPORTED_NUM_BITS} " - "are supported.") - if self.group_size not in MARLIN_QQQ_SUPPORTED_GROUP_SIZES: - raise ValueError( - f"QQQ does not support group_size = {self.group_size}. " - f"Only group_sizes = {MARLIN_QQQ_SUPPORTED_GROUP_SIZES} " - "are supported.") - if self.is_sym not in MARLIN_QQQ_SUPPORTED_SYM: - raise ValueError( - f"QQQ does not support is_sym = {self.is_sym}. " - f"Only sym = {MARLIN_QQQ_SUPPORTED_SYM} are supported.") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // self.weight_bits - - # Tile size used by QQQ kernels. - self.tile_size = MARLIN_QQQ_TILE - - # Min out_features dim - self.min_n_threads = MARLIN_QQQ_MIN_THREAD_N - - # Min in_features dim - self.min_k_threads = MARLIN_QQQ_MIN_THREAD_K - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = MARLIN_QQQ_MAX_PARALLEL - - # Permutation length used by the QQQ kernels. 
- self.perm_len = 1024 - - def __repr__(self) -> str: - return "QQQConfig(weight_bits={}, group_size={})".format( - self.weight_bits, self.group_size) - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "qqq" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "QQQConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - return cls(weight_bits, group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["QQQLinearMethod"]: - if isinstance(layer, LinearBase): - return QQQLinearMethod(self) - return None - - -class QQQLinearMethod(LinearMethodBase): - """Linear method for QQQ. - - Args: - quant_config: The QQQ quantization config. - """ - - def __init__(self, quant_config: QQQConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs["weight_loader"] - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. 
- qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - s_channel = ChannelQuantScaleParameter(data=torch.empty( - 1, - output_size_per_partition, - device="cuda", - dtype=torch.float, - ), - weight_loader=weight_loader, - output_dim=1) - - if self.quant_config.group_size == -1: - s_group_data = torch.tensor( - [], - device="cuda", - dtype=torch.half, - ) - else: - s_group_data = torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition, - device="cuda", - dtype=torch.half, - ) - - s_group_attr = {"data": s_group_data, "weight_loader": weight_loader} - - if self.quant_config.group_size == -1: - s_group = BasevLLMParameter(**s_group_attr) - else: - s_group = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **s_group_attr) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s_channel", s_channel) - layer.register_parameter("s_group", s_group) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s_channel = Parameter(layer.s_channel.data, requires_grad=False) - layer.s_group = Parameter(layer.s_group.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - s_ch = layer.s_channel - s_group = layer.s_group - workspace = layer.workspace - - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = s_ch.shape[1] - - x_int8, s_tok, _ = ops.scaled_int8_quant(x_2d) - - output_2d = ops.marlin_qqq_gemm(x_int8, qweight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py deleted file mode 100644 index 8a64bebae0..0000000000 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import numpy -import torch - -from .marlin_utils_test import marlin_permute_weights -from .quant_utils import get_pack_factor, qqq_quantize_weights - - -def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): - # Permute - q_w = marlin_permute_weights(q_w, size_k, size_n, perm) - - # Pack - pack_factor = get_pack_factor(num_bits) - orig_device = q_w.device - - q_w = q_w.cpu().numpy().astype(numpy.uint32) 
- - q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), - dtype=numpy.uint32) - if group_size == size_k: - for i in range(pack_factor): - q_packed |= (q_w[:, i::pack_factor] & 0xF) << num_bits * i - else: - for i in range(pack_factor): - q_packed |= q_w[:, i::pack_factor] << num_bits * i - - q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device) - - return q_packed - - -def get_qqq_scale_perms(): - scale_perm: list[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - -# NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 -def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: list[int] = [] - for i in range(32): - perm1: list[int] = [] - col = i // 4 - for block in [0, 1]: - for row in [ - 4 * (i % 4), - 4 * (i % 4) + 1, - 4 * (i % 4) + 2, - 4 * (i % 4) + 3, - ]: - perm1.append(16 * row + col + 8 * block) - for j in range(4): - perm_list.extend([p + 256 * j for p in perm1]) - - perm = numpy.array(perm_list) - - assert quant_type in ["per-channel", - "per-group"], "not supported quantization type" - if num_bits == 4: - if quant_type == "per-channel": - interleave = numpy.array([4, 0, 5, 1, 6, 2, 7, 3]) - else: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - else: - raise Exception("num_bits must be 4, got {}".format(num_bits)) - - perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() - perm = torch.from_numpy(perm) - return perm - - -def marlin_qqq_permute_scales(s_group, s_channel, size_k, size_n, group_size): - scale_perm, scale_perm_single = get_qqq_scale_perms() - if group_size < size_k and group_size != -1: - s_group = s_group.reshape((-1, len(scale_perm)))[:, scale_perm] - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_group = s_group.reshape((-1, size_n)).contiguous() - else: - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_channel = s_channel.reshape((-1, size_n)).contiguous() - - return s_group, s_channel - - -def marlin_qqq_quantize( - w: torch.Tensor, - num_bits: int, - group_size: int, -): - size_k, size_n = w.shape - - # Normalize group_size - if group_size == -1: - group_size = size_k - assert group_size <= size_k - quant_type = "per-channel" if group_size == size_k else "per-group" - - # Quantize - w_ref, q_w, s_group, s_channel = qqq_quantize_weights( - w, num_bits, group_size) - - # Reformat to marlin_qqq - weight_perm = get_qqq_weight_perm(num_bits, quant_type) - marlin_qqq_q_w = marlin_qqq_weights(q_w, size_k, size_n, num_bits, - weight_perm, group_size) - marlin_qqq_s_group, marlin_qqq_s_channel = marlin_qqq_permute_scales( - s_group, s_channel, size_k, size_n, group_size) - - # Create result - res_list = [ - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel - ] - for i in range(len(res_list)): - res_list[i] = res_list[i].to(w.device) - - return res_list diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 428e9e99aa..3cfaca6230 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -9,8 +9,6 @@ import numpy import torch from vllm._custom_ops import 
cutlass_scaled_mm_supports_fp4 -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -386,89 +384,6 @@ def gptq_quantize_weights(w: torch.Tensor, return w_ref, w_q, w_s, g_idx, rand_perm -# QQQ employs different quant schemes for per-group and -# per-channel quantization. -def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int): - orig_device = w.device - size_k, size_n = w.shape - - assert w.is_floating_point(), "w must be float" - assert num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS, \ - f"Unsupported num_bits = {num_bits}" - assert group_size in SUPPORTED_GROUP_SIZES + [ - size_k - ], f"Unsupported groupsize = {group_size}" - - if group_size == -1: - group_size = size_k - assert group_size <= size_k - - if group_size < size_k: - # Reshape to [groupsize, -1] - w = w.reshape((-1, group_size, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((group_size, -1)) - - max_q_val = 2**num_bits - 1 - half_q_val = (max_q_val + 1) // 2 - - # Compute scale for each group - s_group = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_group *= 2 / max_q_val # 2 => symmetric - - # Quantize - q_w = torch.round(w / s_group).int() - q_w += half_q_val - q_w = torch.clamp(q_w, 0, max_q_val) - # Compute ref (dequantized) - w_ref = (q_w - half_q_val).half() * s_group - - # Restore original shapes - def reshape_w(w): - w = w.reshape((group_size, -1, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((size_k, size_n)).contiguous() - return w - - q_w = reshape_w(q_w) - w_ref = reshape_w(w_ref) - - # Compute int8 quantization scale for each channel - s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0] - s_channel /= 127.0 - t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8) - w_ref = t_int8.half() * s_channel - s_channel = s_channel.reshape(1, -1).to(dtype=torch.float) - - # Fuse scales - s_group = (s_group.reshape(-1, size_n).contiguous() / - s_channel).to(dtype=torch.half) - else: - max_q_val = 2**(num_bits - 1) - 1 - - # Compute scale for each channel - s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_channel /= max_q_val - - # Quantize - q_w = torch.round(w / s_channel).int() - q_w = torch.clamp(q_w, -max_q_val, max_q_val) - # Compute ref (dequantized) - w_ref = q_w.half() * s_channel - - s_group = torch.tensor([], dtype=torch.half) - # div 2 ** (8 - self.bits)) to offset right shift in unpacking - s_channel /= (2**(8 - num_bits)) - s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float) - - return ( - w_ref.to(device=orig_device), - q_w.to(device=orig_device), - s_group.to(device=orig_device), - s_channel.to(device=orig_device), - ) - - def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor): orig_device = q_w.device From 582bbe6bd708d01d74d6d02d6ef59b4c3c34a7b1 Mon Sep 17 00:00:00 2001 From: bigmoyan Date: Thu, 21 Aug 2025 03:59:54 +0800 Subject: [PATCH 440/932] [Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259) Co-authored-by: wangzhengtao --- .../test_completion_with_function_calling.py | 314 +++++++++++------- tests/utils.py | 10 +- vllm/entrypoints/chat_utils.py | 17 +- vllm/entrypoints/openai/protocol.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 64 +++- .../tool_parsers/deepseekv3_tool_parser.py | 4 +- .../granite_20b_fc_tool_parser.py | 4 +- .../tool_parsers/granite_tool_parser.py | 4 +- .../openai/tool_parsers/hermes_tool_parser.py | 4 +- 
.../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 4 +- .../openai/tool_parsers/llama_tool_parser.py | 4 +- .../tool_parsers/minimax_tool_parser.py | 4 +- .../tool_parsers/phi4mini_tool_parser.py | 4 +- .../openai/tool_parsers/xlam_tool_parser.py | 4 +- 15 files changed, 283 insertions(+), 166 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f861..4ef5d4e8a6 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -13,6 +13,127 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen3-0.6B" +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": + "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" 
+ }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, +] + @pytest.fixture(scope="module") def server(): # noqa: F811 @@ -27,6 +148,8 @@ def server(): # noqa: F811 "hermes", "--reasoning-parser", "qwen3", + "--gpu-memory-utilization", + "0.4" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -54,129 +177,6 @@ async def client(server): async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: Union[str, dict], enable_thinking: bool): - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - "options": { - "$ref": "#/$defs/WeatherOptions", - "description": - "Optional parameters for weather query", - }, - }, - "required": ["country", "unit"], - "$defs": { - "WeatherOptions": { - "title": "WeatherOptions", - "type": "object", - "additionalProperties": False, - "properties": { - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "default": "celsius", - "description": "Temperature unit", - "title": "Temperature Unit", - }, - "include_forecast": { - "type": "boolean", - "default": False, - "description": - "Whether to include a 24-hour forecast", - "title": "Include Forecast", - }, - "language": { - "type": "string", - "default": "zh-CN", - "description": "Language of the response", - "title": "Language", - "enum": ["zh-CN", "en-US", "ja-JP"], - }, - }, - }, - }, - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_forecast", - "description": "Get the weather forecast for a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "days", "unit"], - }, - }, - }, - ] - - messages = [ - { - "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ - "forecast for the next 5 days, in fahrenheit?", - }, - ] if not stream: # Non-streaming test chat_completion = await client.chat.completions.create( @@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) assert len(output) > 0 + + +@pytest.fixture(scope="module") +def k2_server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "half", + "--enable-auto-tool-choice", + "--guided-decoding-backend", + "xgrammar", + "--tool-call-parser", + "hermes", + "--reasoning-parser", + "qwen3", + "--gpu-memory-utilization", + "0.4", + ] + # hack to test kimi_k2 tool use tool_id format. + # avoid error in is_deepseek_mla check by setting kv_lora_rank=null + with RemoteOpenAIServer(MODEL_NAME, + args, + override_hf_configs={ + "model_type": 'kimi_k2', + 'kv_lora_rank': None + }) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def k2_client(k2_server): + async with k2_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("tool_choice", ["required"]) +async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, + stream: bool, tool_choice: str): + + if not stream: + # Non-streaming test + chat_completion = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice) + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + assert chat_completion.choices[0].message.tool_calls[ + 0].id == 'functions.get_current_weather:0' + else: + # Streaming test + output_stream = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice, + stream=True) + + output = [] + async for chunk in output_stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + for o in output: + assert o.id is None or o.id == 'functions.get_current_weather:0' diff --git a/tests/utils.py b/tests/utils.py index e98707fb44..4dba549466 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ import asyncio import copy import functools import importlib +import json import os import signal import subprocess @@ -101,7 +102,8 @@ class RemoteOpenAIServer: env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = 0, auto_port: bool = True, - max_wait_seconds: Optional[float] = None) -> None: + max_wait_seconds: Optional[float] = None, + override_hf_configs: Optional[dict[str, Any]] = None) -> None: if auto_port: if "-p" in vllm_serve_args or "--port" in vllm_serve_args: raise ValueError("You have manually specified the port " @@ -120,6 +122,12 @@ class RemoteOpenAIServer: vllm_serve_args = vllm_serve_args + ["--seed", str(seed)] + if override_hf_configs is not None: + vllm_serve_args = vllm_serve_args + [ + "--hf-overrides", + json.dumps(override_hf_configs) + ] + parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") subparsers = parser.add_subparsers(required=False, dest="subparser") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 74c8093f49..87772a499f 
100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1345,5 +1345,18 @@ def apply_mistral_chat_template( "template") raise ValueError(str(e)) from e -def random_tool_call_id() -> str: - return f"chatcmpl-tool-{random_uuid()}" +def get_history_tool_calls_cnt(conversation: list[ConversationMessage]): + idx = 0 + for msg in conversation: + if msg['role'] == 'assistant': + tool_calls = msg.get('tool_calls') + idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa + return idx + +def make_tool_call_id(id_type:str='random', func_name=None, idx=None): + + if id_type=='kimi_k2': + return f'functions.{func_name}:{idx}' + else: + # by default return random + return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 39facd4d53..a44868973f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,7 +38,7 @@ from typing_extensions import TypeAlias from vllm import envs from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - random_tool_call_id) + make_tool_call_id) from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam) from vllm.logger import init_logger @@ -1634,7 +1634,7 @@ class FunctionCall(OpenAIBaseModel): class ToolCall(OpenAIBaseModel): - id: str = Field(default_factory=random_tool_call_id) + id: str = Field(default_factory=make_tool_call_id) type: Literal["function"] = "function" function: FunctionCall diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d57868847e..65aac23ee6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, - random_tool_call_id) + get_history_tool_calls_cnt, + make_tool_call_id) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, parse_chat_input, @@ -133,6 +134,10 @@ class OpenAIServingChat(OpenAIServing): source = "model" if source == "auto" else source logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + if self.model_config.hf_config.model_type == 'kimi_k2': + self.tool_call_id_type = 'kimi_k2' + else: + self.tool_call_id_type = 'random' self.use_harmony = model_config.hf_config.model_type == "gpt_oss" if self.use_harmony: @@ -379,6 +384,7 @@ class OpenAIServingChat(OpenAIServing): current_text: Optional[str], delta_text: str, function_name_returned: bool, + tool_call_idx: Optional[int] = None ) -> tuple[Optional[DeltaMessage], bool]: if current_text is None or current_text == "": # if the current text is empty, we cannot parse it @@ -424,8 +430,12 @@ class OpenAIServingChat(OpenAIServing): current_tool_call = obj[-2] function_name_returned = True + tool_call_id = make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=current_tool_call["name"], + idx=tool_call_idx) delta_message = DeltaMessage(tool_calls=[ - DeltaToolCall(id=random_tool_call_id(), + DeltaToolCall(id=tool_call_id, function=DeltaFunctionCall( name=current_tool_call["name"], arguments=arguments), @@ -491,6 +501,10 @@ class OpenAIServingChat(OpenAIServing): all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * 
num_choices + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 # Always track previous_texts for comprehensive output logging previous_texts = [""] * num_choices @@ -673,7 +687,6 @@ class OpenAIServingChat(OpenAIServing): previous_text = previous_texts[i] previous_token_ids = all_previous_token_ids[i] current_text = previous_text + delta_text - # avoid the None + list error. if previous_token_ids: current_token_ids = previous_token_ids + as_list( @@ -733,7 +746,7 @@ class OpenAIServingChat(OpenAIServing): index=i) else: delta_tool_call = DeltaToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=DeltaFunctionCall( name=tool_choice_function_name, @@ -764,7 +777,11 @@ class OpenAIServingChat(OpenAIServing): previous_text=previous_text, current_text=content, delta_text=delta_text, - function_name_returned=fn_name_returned)) + function_name_returned=fn_name_returned, + tool_call_idx=history_tool_call_cnt)) + if (delta_message and delta_message.tool_calls and + delta_message.tool_calls[0].id is not None): + history_tool_call_cnt += 1 # update the previous values for the next iteration previous_texts[i] = current_text @@ -1089,6 +1106,10 @@ class OpenAIServingChat(OpenAIServing): assert final_res is not None choices: list[ChatCompletionResponseChoice] = [] + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -1194,17 +1215,26 @@ class OpenAIServingChat(OpenAIServing): assert content is not None tool_calls = TypeAdapter( list[FunctionDefinition]).validate_json(content) + tool_call_ids = [] + for tool_call in tool_calls: + tool_call_ids.append( + make_tool_call_id(id_type=self.tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt)) + history_tool_call_cnt += 1 message = ChatMessage( role=role, content="", - reasoning_content=reasoning_content, tool_calls=[ - tool_call_class(function=FunctionCall( - name=tool_call.name, - arguments=json.dumps(tool_call.parameters, - ensure_ascii=False))) - for tool_call in tool_calls - ]) + tool_call_class(id=tool_call_ids[i], + function=FunctionCall( + name=tool_call.name, + arguments=json.dumps( + tool_call.parameters, + ensure_ascii=False))) + for i, tool_call in enumerate(tool_calls) + ], + reasoning_content=reasoning_content) # if the request doesn't use tool choice # OR specifies to not use a tool @@ -1248,7 +1278,6 @@ class OpenAIServingChat(OpenAIServing): if (tool_call_info.content and len(tool_call_info.content) > 0): ret_content = tool_call_info.content - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=ret_content) @@ -1327,12 +1356,11 @@ class OpenAIServingChat(OpenAIServing): elif choice.message.tool_calls: # For tool calls, log the function name and arguments tool_call_descriptions = [] - for tool_call in choice.message.tool_calls: - if hasattr(tool_call.function, "name") and hasattr( - tool_call.function, "arguments"): + for tc in choice.message.tool_calls: + if hasattr(tc.function, "name") and hasattr( + tc.function, "arguments"): tool_call_descriptions.append( - f"{tool_call.function.name}({tool_call.function.arguments})" - ) + f"{tc.function.name}({tc.function.arguments})") tool_calls_str = ", ".join(tool_call_descriptions) output_text = f"[tool_calls: {tool_calls_str}]" diff --git 
a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index da4760ad1b..ac272b0c3b 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -6,7 +6,7 @@ from typing import Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser): DeltaToolCall( index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True), diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 5508ba6a39..824b100f35 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -10,7 +10,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -203,7 +203,7 @@ class Granite20bFCToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index fcc5b7edda..ac517616a9 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -185,7 +185,7 @@ class GraniteToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index d126130ab9..a6ce33af6b 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -307,7 +307,7 @@ class Hermes2ProToolParser(ToolParser): return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, 
type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 92004de030..6ef8fadf59 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -107,7 +107,7 @@ class Internlm2ToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 66b483d8b0..3b41f60347 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -222,7 +222,7 @@ class JambaToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 194a144ad5..31b19c8db4 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -10,7 +10,7 @@ import regex as re from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -213,7 +213,7 @@ class Llama3JsonToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 226309ef29..283e609501 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 
DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -394,7 +394,7 @@ class MinimaxToolParser(ToolParser): sent_tools.append({ "sent_name": False, "sent_arguments": "", - "id": random_tool_call_id(), + "id": make_tool_call_id(), }) while len(tool_ids) < tool_count: diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 5501028cf3..85dd56213c 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -8,7 +8,7 @@ from typing import Any, Optional import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, ExtractedToolCallInformation, @@ -74,7 +74,7 @@ class Phi4MiniJsonToolParser(ToolParser): tool_calls: list[ToolCall] = [ ToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=FunctionCall( name=raw_function_call["name"], diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 321718b1c9..87cd413b37 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -226,7 +226,7 @@ class xLAMToolParser(ToolParser): function_name = name_match.group(1) # The test expects us to send just the name first - tool_id = random_tool_call_id() + tool_id = make_tool_call_id() delta = DeltaMessage(tool_calls=[ DeltaToolCall( index=0, From b95697d7310637399998ebf1f21a26b523aa6611 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 20 Aug 2025 13:03:37 -0700 Subject: [PATCH 441/932] [Frontend] improve error logging of chat completion (#22957) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 74 +++++++++++++++++++++------ 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 24148bcef2..14ba8aa641 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -600,8 +600,11 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Responses API") - - generator = await handler.create_responses(request, raw_request) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -618,7 +621,11 @@ async def retrieve_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.retrieve_responses(response_id) + try: + response = await handler.retrieve_responses(response_id) + except Exception as e: + raise 
HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -633,7 +640,11 @@ async def cancel_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.cancel_responses(response_id) + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -667,9 +678,11 @@ async def create_chat_completion(request: ChatCompletionRequest, if handler is None: return base(raw_request).create_error_response( message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -742,7 +755,11 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Embeddings API") - generator = await handler.create_embedding(request, raw_request) + try: + generator = await handler.create_embedding(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -770,8 +787,11 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Pooling API") - - generator = await handler.create_pooling(request, raw_request) + try: + generator = await handler.create_pooling(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -791,7 +811,11 @@ async def create_classify(request: ClassificationRequest, return base(raw_request).create_error_response( message="The model does not support Classification API") - generator = await handler.create_classify(request, raw_request) + try: + generator = await handler.create_classify(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -820,7 +844,11 @@ async def create_score(request: ScoreRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Score API") - generator = await handler.create_score(request, raw_request) + try: + generator = await handler.create_score(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return 
JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -878,8 +906,12 @@ async def create_transcriptions(raw_request: Request, message="The model does not support Transcriptions API") audio_data = await request.file.read() - generator = await handler.create_transcription(audio_data, request, - raw_request) + try: + generator = await handler.create_transcription(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -919,8 +951,12 @@ async def create_translations(request: Annotated[TranslationRequest, message="The model does not support Translations API") audio_data = await request.file.read() - generator = await handler.create_translation(audio_data, request, - raw_request) + try: + generator = await handler.create_translation(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -949,7 +985,11 @@ async def do_rerank(request: RerankRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Rerank (Score) API") - generator = await handler.do_rerank(request, raw_request) + try: + generator = await handler.do_rerank(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) From bf7c99dfc40bff6844b2ae57554516922eb93b71 Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Wed, 20 Aug 2025 13:17:11 -0700 Subject: [PATCH 442/932] [Perf] Speed up function `_convert_tokens_to_string_with_added_encoders` by 13.7x (#20413) Signed-off-by: Saurabh Misra Signed-off-by: Aseem Saxena Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> Co-authored-by: Aseem Saxena --- vllm/transformers_utils/detokenizer_utils.py | 25 ++++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index be1040c3e0..101f31d39c 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -23,27 +23,32 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. 
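(Editor's aside, not part of this diff: the hunk below works by hoisting per-token lookups out of the loop. A minimal, self-contained sketch of that pattern follows, assuming only the standard library; DummyTokenizer and the token list are invented stand-ins for the real tokenizer objects.)

import timeit

class DummyTokenizer:
    """Invented stand-in exposing the two methods the hot loop touches."""

    def __init__(self):
        self._added = {f"<extra_{i}>": i for i in range(1000)}

    def get_added_vocab(self):
        # Returns a fresh dict on each call, as HF tokenizers do.
        return dict(self._added)

    def convert_tokens_to_string(self, tokens):
        return " ".join(tokens)

tok = DummyTokenizer()
output_tokens = ["hello", "<extra_3>", "world"] * 2000

def per_iteration_lookups():
    pieces = []
    for token in output_tokens:
        if token in tok.get_added_vocab():  # dict rebuilt on every iteration
            pieces.append(token)
        else:
            pieces.append(tok.convert_tokens_to_string([token]))
    return "".join(pieces)

def hoisted_lookups():
    added_vocab_set = set(tok.get_added_vocab())  # computed once
    convert = tok.convert_tokens_to_string        # bound once
    pieces = []
    for token in output_tokens:
        if token in added_vocab_set:
            pieces.append(token)
        else:
            pieces.append(convert([token]))
    return "".join(pieces)

assert per_iteration_lookups() == hoisted_lookups()
print("per-iteration:", timeit.timeit(per_iteration_lookups, number=20))
print("hoisted:      ", timeit.timeit(hoisted_lookups, number=20))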
+ # Performance improvements: avoid repeated attribute and function lookups; + # localize frequently used objects; + sub_texts: list[str] = [] current_sub_text: list[str] = [] - all_special_tokens = set(tokenizer.all_special_tokens) + convert_tokens_to_string = tokenizer.convert_tokens_to_string + added_vocab_set = set(tokenizer.get_added_vocab()) + all_special_tokens = set( + tokenizer.all_special_tokens) if skip_special_tokens else () + for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: + # Use precomputed set for skip-special check + if token in all_special_tokens: continue - if token in tokenizer.get_added_vocab(): + if token in added_vocab_set: if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] + sub_texts.append(convert_tokens_to_string(current_sub_text)) + current_sub_text.clear() sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) + sub_texts.append(convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: return " ".join(sub_texts) - else: - return "".join(sub_texts) + return "".join(sub_texts) # 5 is an arbitrary value that should work for all From 4e51fa8cbaba2c6fd516b4615a533b0a94796516 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 16:28:30 -0400 Subject: [PATCH 443/932] Do not use eval() to convert unknown types (#23266) Signed-off-by: Russell Bryant --- .../openai/tool_parsers/qwen3coder_tool_parser.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index cf4d0b231a..2501d6739e 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser): "valid JSON object in tool '%s', will try other " "methods to parse it.", param_value, param_name, func_name) - try: - converted_value = eval(param_value) - return converted_value - except Exception: - logger.warning( - "Parsed value '%s' of parameter '%s' cannot be " - "converted via Python `eval()` in tool '%s', " - "degenerating to string.", param_value, param_name, - func_name) + logger.warning( + "Parameter '%s' has unknown type '%s'. 
" + "The value will be treated as a string.", param_name, + param_type) return param_value # Extract function name From 4fbda0b20cc539f72314375c2abc6100ebac8392 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 21 Aug 2025 05:07:28 +0800 Subject: [PATCH 444/932] [Feature] use --eplb_config to set eplb param (#20562) Signed-off-by: rongfu.leng Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: rongfu.leng Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 3 +- vllm/config/parallel.py | 108 +++++++++++++++++----- vllm/distributed/eplb/eplb_state.py | 4 +- vllm/engine/arg_utils.py | 63 +++++++++---- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/glm4_moe.py | 4 +- vllm/model_executor/models/qwen3_moe.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 4 +- 9 files changed, 149 insertions(+), 52 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 62dfd4333b..959f111ced 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -33,7 +33,8 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) -from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, + ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7a9e68f0ea..2b716a7706 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -6,7 +6,7 @@ from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional, Union import torch -from pydantic import model_validator +from pydantic import TypeAdapter, model_validator from pydantic.dataclasses import dataclass from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self @@ -32,6 +32,38 @@ logger = init_logger(__name__) DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] +@config +@dataclass +class EPLBConfig: + """Configuration for Expert Parallel Load Balancing (EP).""" + + window_size: int = 1000 + """Window size for expert load recording.""" + step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `lb_window_size` steps will be used for rearranging experts. + """ + + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + + log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + @classmethod + def from_cli(cls, cli_value: str) -> "EPLBConfig": + """Parse the CLI value for the compilation config. + -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. 
+ """ + return TypeAdapter(EPLBConfig).validate_json(cli_value) + + @config @dataclass class ParallelConfig: @@ -75,22 +107,24 @@ class ParallelConfig: """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ + eplb_config: EPLBConfig = field(default_factory=EPLBConfig) + """Expert parallelism configuration.""" + num_redundant_experts: Optional[int] = None + """`num_redundant_experts` is deprecated and has been replaced with + `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. + Please use `eplb_config.num_redundant_experts` instead.""" + eplb_window_size: Optional[int] = None + """`eplb_window_size` is deprecated and has been replaced with + `eplb_config.window_size`. This will be removed in v0.12.0. + Please use `eplb_config.window_size` instead.""" + eplb_step_interval: Optional[int] = None + """`eplb_step_interval` is deprecated and has been replaced with + `eplb_config.step_interval`. This will be removed in v0.12.0. + Please use `eplb_config.step_interval` instead.""" + eplb_log_balancedness: Optional[bool] = None + """`eplb_log_balancedness` is deprecated and has been replaced with + `eplb_config.log_balancedness`. This will be removed in v0.12.0. + Please use `eplb_config.log_balancedness` instead.""" max_parallel_loading_workers: Optional[int] = None """Maximum number of parallel loading workers when loading model @@ -237,6 +271,38 @@ class ParallelConfig: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: + # Forward deprecated fields to their new location + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = ( + self.num_redundant_experts) + logger.warning_once( + "num_redundant_experts is deprecated and has been replaced " + "with eplb_config.num_redundant_experts. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + logger.warning_once( + "eplb_window_size is deprecated and has been replaced " + "with eplb_config.window_size. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + logger.warning_once( + "eplb_step_interval is deprecated and has been replaced " + "with eplb_config.step_interval. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + logger.warning_once( + "eplb_log_balancedness is deprecated and has been replaced " + "with eplb_config.log_balancedness. This will be removed " + "in v0.12.0. 
Changing this field after initialization will " + "have no effect.") + + # Continue with the rest of the initialization self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size @@ -275,10 +341,10 @@ class ParallelConfig: raise ValueError( "Expert parallelism load balancing is only supported on " "CUDA devices now.") - if self.num_redundant_experts < 0: + if self.eplb_config.num_redundant_experts < 0: raise ValueError( "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if not self.enable_expert_parallel: raise ValueError( "enable_expert_parallel must be True to use EPLB.") @@ -289,10 +355,10 @@ class ParallelConfig: f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." ) else: - if self.num_redundant_experts != 0: + if self.eplb_config.num_redundant_experts != 0: raise ValueError( "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 979f2a06ce..042acf40d6 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -244,7 +244,7 @@ class EplbState: dtype=torch.int32, device=device, ) - expert_load_window_size = parallel_config.eplb_window_size + expert_load_window_size = parallel_config.eplb_config.window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, model.num_physical_experts), @@ -253,7 +253,7 @@ class EplbState: ) # Set the initial progress of rearrangement to 3/4 - eplb_step_interval = parallel_config.eplb_step_interval + eplb_step_interval = parallel_config.eplb_config.step_interval expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6869c3f23f..dcf7875894 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, + DeviceConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, @@ -305,11 +305,12 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb - num_redundant_experts: int = ParallelConfig.num_redundant_experts - eplb_window_size: int = ParallelConfig.eplb_window_size - eplb_step_interval: int = ParallelConfig.eplb_step_interval - eplb_log_balancedness: bool = ParallelConfig.eplb_log_balancedness + num_redundant_experts: int = EPLBConfig.num_redundant_experts + eplb_window_size: int = EPLBConfig.window_size + eplb_step_interval: int = EPLBConfig.step_interval + eplb_log_balancedness: bool = EPLBConfig.log_balancedness max_parallel_loading_workers: Optional[ int] = 
ParallelConfig.max_parallel_loading_workers block_size: Optional[BlockSize] = CacheConfig.block_size @@ -454,6 +455,9 @@ class EngineArgs: if isinstance(self.compilation_config, dict): self.compilation_config = CompilationConfig( **self.compilation_config) + if isinstance(self.eplb_config, dict): + self.eplb_config = EPLBConfig.from_cli(json.dumps( + self.eplb_config)) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() @@ -661,14 +665,32 @@ class EngineArgs: **parallel_kwargs["enable_expert_parallel"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) - parallel_group.add_argument("--num-redundant-experts", - **parallel_kwargs["num_redundant_experts"]) - parallel_group.add_argument("--eplb-window-size", - **parallel_kwargs["eplb_window_size"]) - parallel_group.add_argument("--eplb-step-interval", - **parallel_kwargs["eplb_step_interval"]) - parallel_group.add_argument("--eplb-log-balancedness", - **parallel_kwargs["eplb_log_balancedness"]) + parallel_group.add_argument("--eplb-config", + **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--num-redundant-experts", + type=int, + help= + "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-window-size", + type=int, + help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-step-interval", + type=int, + help= + "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-log-balancedness", + action=argparse.BooleanOptionalAction, + help= + "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( "--max-parallel-loading-workers", **parallel_kwargs["max_parallel_loading_workers"]) @@ -1244,6 +1266,16 @@ class EngineArgs: "Currently, speculative decoding is not supported with " "async scheduling.") + # Forward the deprecated CLI args to the EPLB config. + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1257,10 +1289,7 @@ class EngineArgs: data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, - num_redundant_experts=self.num_redundant_experts, - eplb_window_size=self.eplb_window_size, - eplb_step_interval=self.eplb_step_interval, - eplb_log_balancedness=self.eplb_log_balancedness, + eplb_config=self.eplb_config, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f199da135e..d56224b4b7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -132,10 +132,10 @@ class DeepseekV2MoE(nn.Module): # Load balancing settings. 
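(Editor's aside, not part of this diff: a minimal sketch of how the consolidated `--eplb-config` option introduced above is meant to be driven, assuming a vLLM build that contains the new EPLBConfig; the concrete field values and the serve command in the comment are illustrative assumptions.)

import json

from vllm.config import EPLBConfig  # re-exported from vllm.config.parallel above

# Roughly equivalent CLI usage (values are illustrative):
#   vllm serve <model> --enable-expert-parallel --enable-eplb \
#       --eplb-config '{"window_size": 1000, "step_interval": 3000,
#                       "num_redundant_experts": 2, "log_balancedness": false}'
cfg = EPLBConfig.from_cli(
    json.dumps({
        "window_size": 1000,
        "step_interval": 3000,
        "num_redundant_experts": 2,
        "log_balancedness": False,
    }))
assert cfg.num_redundant_experts == 2
print(cfg)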
vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index aff491f959..fe5e46a998 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -131,10 +131,10 @@ class Glm4MoE(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 05bbb0d2e8..2812f79a66 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -121,11 +121,11 @@ class Qwen3MoeSparseMoeBlock(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb self.n_logical_experts = self.n_routed_experts - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) self.n_local_physical_experts = self.n_physical_experts // self.ep_size @@ -363,7 +363,8 @@ class Qwen3MoeModel(nn.Module): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config enable_eplb = parallel_config.enable_eplb - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d9770226b1..33747d6917 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1435,7 +1435,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model, is_dummy, is_profile, - log_stats=self.parallel_config.eplb_log_balancedness, + log_stats=self.parallel_config.eplb_config.log_balancedness, ) def get_dp_padding(self, @@ -1977,7 +1977,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): global_expert_load, old_global_expert_indices = ( EplbState.recv_state()) num_logical_experts = global_expert_load.shape[1] - self.parallel_config.num_redundant_experts = ( + self.parallel_config.eplb_config.num_redundant_experts = ( num_local_physical_experts * new_ep_size - num_logical_experts) assert old_global_expert_indices.shape[ 1] % num_local_physical_experts == 0 diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 22e639b97d..d61177d424 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -515,7 +515,7 @@ class 
Worker(WorkerBase): assert self.model_runner.eplb_state is not None new_physical_experts = \ self.model_runner.eplb_state.physical_to_logical_map.shape[1] - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - self.model_runner.eplb_state.logical_replica_count.shape[1]) global_expert_load = None @@ -531,7 +531,7 @@ class Worker(WorkerBase): assert self.model_runner.eplb_state is not None global_expert_load = self.model_runner.eplb_state.rearrange( self.model_runner.model, execute_shuffle=False) - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_load.shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) self.model_runner.model.update_physical_experts_metadata( From 1b125004bea9f4cd120d3ce96dc1d3a2962ebace Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Aug 2025 05:15:34 +0800 Subject: [PATCH 445/932] [misc] fix multiple arch wheels for the nightly index (#23110) Signed-off-by: youkaichao --- .buildkite/generate_index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 7045d88104..6b5a2a9935 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -8,7 +8,8 @@ template = """

<html>
  <body>
    <h1>Links for vLLM</h1>
-    <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+    <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
  </body>
</html>
""" @@ -21,7 +22,20 @@ filename = os.path.basename(args.wheel) with open("index.html", "w") as f: print(f"Generated index.html for {args.wheel}") + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64") + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64") + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") # cloudfront requires escaping the '+' character f.write( - template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) ) From a4fbb32fab3d2f91b3672bf581565378aaa18d6c Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:43:17 -0400 Subject: [PATCH 446/932] Remove chunked_prefill_enabled flag in V1 MLA (#23183) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/mla/common.py | 50 +++++++++++------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f2610671f7..646e4fec83 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.model_config = vllm_config.model_config cache_config = vllm_config.cache_config parallel_config = vllm_config.parallel_config - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled self.num_heads = self.model_config.get_num_attention_heads( parallel_config) self.mla_dims = get_mla_dims(self.model_config) @@ -426,30 +425,28 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size - if self.chunked_prefill_enabled: - self.chunked_prefill_workspace_size = min( - # Max sure there is enough for 8 full length request or at least - # 4 pages of cache per request - max( - 8 * self.model_config.max_model_len, 4 * - scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) - assert self.chunked_prefill_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size - self.chunked_prefill_workspace = torch.empty( - (self.chunked_prefill_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=device, - ) + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * self.model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * cache_config.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + 
scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) self._use_cudnn_prefill = use_cudnn_prefill() self._use_fi_prefill = use_flashinfer_prefill() @@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): reqs_start:] - query_start_loc[reqs_start] chunked_context_metadata = None - if self.chunked_prefill_enabled and num_prefills > 0 \ - and max_context_len_cpu > 0: + if max_context_len_cpu > 0: # NOTE: it is recommend you read the `Chunked Prefill` section # in the comment at the top of the file before trying to # understand the following code From 10cc12ba66834e33659f1ce3a00235506db20dd5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:46:47 -0400 Subject: [PATCH 447/932] Feature/mla tests (#23195) Signed-off-by: Matthew Bonanni Signed-off-by: Matthew Bonanni --- tests/v1/attention/test_attention_backends.py | 26 +- tests/v1/attention/test_mla_backends.py | 522 ++++++++++++++++++ tests/v1/attention/utils.py | 11 +- vllm/v1/attention/backends/mla/common.py | 16 +- 4 files changed, 551 insertions(+), 24 deletions(-) create mode 100644 tests/v1/attention/test_mla_backends.py diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index ac08b9052c..60e04ad906 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -150,15 +150,15 @@ def create_and_prepopulate_kv_cache( # Permute the context blocks (excluding block 0 which is null) if randomize_blocks: - perm = torch.randperm( - blocks_end - 1) + 1 # Random permutation starting from block 1 + # Random permutation starting from block 1 + perm = torch.randperm(blocks_end - 1) + 1 else: - perm = torch.arange( - 1, blocks_end) # Sequential order starting from block 1 + # Sequential order starting from block 1 + perm = torch.arange(1, blocks_end) inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) - inv_perm[1:] = torch.argsort( - perm) + 1 # Add 1 to account for starting from block 1 + # Add 1 to account for starting from block 1 + inv_perm[1:] = torch.argsort(perm) + 1 kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...] # Construct the right block table @@ -281,7 +281,8 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, @pytest.mark.parametrize("batch_spec_name", [ "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium" + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" ]) @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) def test_backend_correctness(batch_spec_name: str, model: str): @@ -302,7 +303,8 @@ def test_backend_correctness(batch_spec_name: str, model: str): """ batch_spec = BATCH_SPECS[batch_spec_name] vllm_config = create_vllm_config(model_name=model, - max_model_len=max(batch_spec.seq_lens)) + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=8192) device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -465,12 +467,6 @@ def test_backend_correctness(batch_spec_name: str, model: str): rtol=rtol, atol=atol) - if not all_close: - print(f"[{backend_name}] output differs from SDPA baseline. 
" - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") - print(f"[{backend_name}] output: {backend_output}") - print(f"[{backend_name}] SDPA baseline: {sdpa_output}") - assert all_close, ( f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py new file mode 100644 index 0000000000..2407035879 --- /dev/null +++ b/tests/v1/attention/test_mla_backends.py @@ -0,0 +1,522 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for v1 MLA backends without GPUModelRunner dependency.""" + +import pytest +import torch + +from tests.v1.attention.utils import (BatchSpec, _Backend, + create_common_attn_metadata, + create_standard_kv_cache_spec, + create_vllm_config, + get_attention_backend) +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.kv_cache_interface import FullAttentionSpec + +BACKENDS_TO_TEST = [ + _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, + _Backend.TRITON_MLA_VLLM_V1 +] + +# Remove CUTLASS_MLA from the list if not using sm100 +if not torch.cuda.is_available() or torch.cuda.get_device_properties( + 0).major < 10: + BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) + +torch.manual_seed(42) + + +def _convert_dtype_to_torch(dtype): + """Convert ModelDType to torch.dtype.""" + if isinstance(dtype, str): + if dtype == "auto": + return torch.float16 # Default dtype for testing + elif dtype in STR_DTYPE_TO_TORCH_DTYPE: + return STR_DTYPE_TO_TORCH_DTYPE[dtype] + else: + raise ValueError(f"Unknown dtype: {dtype}") + elif isinstance(dtype, torch.dtype): + return dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + +# Define common batch configurations +BATCH_SPECS = { + "small_decode": + BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]), + "small_prefill": + BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]), + "mixed_small": + BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]), + "medium_decode": + BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024], + query_lens=[1, 1, 1, 1, 1, 1, 1, 1]), + "medium_prefill": + BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]), + "mixed_medium": + BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048], + query_lens=[1, 1, 1, 7, 7, 7]), + "large_decode": + BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32), + "large_prefill": + BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), + "single_decode": + BatchSpec(seq_lens=[1024], query_lens=[1]), + "single_prefill": + BatchSpec(seq_lens=[1024], query_lens=[64]), +} + + +def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, + device: torch.device, + num_blocks: int = 100) -> torch.Tensor: + """Create a dummy KV cache tensor for testing.""" + kv_cache = torch.randn( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.head_size, # latent dimension + dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), + device=device, + ) + return kv_cache + + +def create_and_prepopulate_kv_cache( + kv_c_contexts: list[torch.Tensor], + k_pe_contexts: list[torch.Tensor], + block_size: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + num_blocks: int, + common_attn_metadata: CommonAttentionMetadata, + randomize_blocks: bool = True) -> 
torch.Tensor: + """Create and prepopulate an MLA KV cache with context data. + + Args: + kv_c_contexts: List of latent KV context tensors for each sequence + k_pe_contexts: List of key positional embedding context tensors + for each sequence + block_size: Size of each block + num_kv_heads: Number of KV heads (should be 1 for MLA) + head_size: Size of each head (latent dimension) + dtype: Data type for the cache + device: Device to create the cache on + num_blocks: Total number of blocks in the cache + common_attn_metadata: Common attention metadata + randomize_blocks: Whether to randomly permute blocks + or use sequential order + + Returns: + MLA KV cache tensor + """ + batch_size = len(kv_c_contexts) + seq_lens = common_attn_metadata.seq_lens_cpu + query_lens = common_attn_metadata.query_start_loc_cpu[ + 1:] - common_attn_metadata.query_start_loc_cpu[:-1] + context_lens = common_attn_metadata.num_computed_tokens_cpu + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + # Create MLA KV cache: (num_blocks, block_size, head_size) + kv_cache = torch.empty(num_blocks, + block_size, + head_size, + dtype=dtype, + device=device) + kv_cache_flat = kv_cache.view(-1, head_size) + + # Populate the cache with the context tokens + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i] + kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1) + start = start_block_idx * block_size + end = start + kv_context.shape[0] + kv_cache_flat[start:end, ...] = kv_context + + # Stay block aligned and allocate enough blocks for the new tokens + start_block_idx += cdiv(int(seq_lens[i]), block_size) + + blocks_end = start_block_idx + + # Permute the context blocks (excluding block 0 which is null) + if randomize_blocks: + perm = torch.randperm( + blocks_end - 1) + 1 # Random permutation starting from block 1 + else: + perm = torch.arange( + 1, blocks_end) # Sequential order starting from block 1 + + inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) + inv_perm[1:] = torch.argsort( + perm) + 1 # Add 1 to account for starting from block 1 + kv_cache[1:blocks_end, ...] = kv_cache[perm, ...] 
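(Editor's aside, not part of the new test file: the argsort-based inverse permutation used above can be checked in isolation. A minimal sketch, assuming only torch; the tensor sizes are arbitrary.)

import torch

num_blocks = 8                                      # block 0 is the null block
old = torch.arange(num_blocks).float().unsqueeze(1) * 10  # fake per-block data
new = old.clone()

perm = torch.randperm(num_blocks - 1) + 1           # shuffle blocks 1..N-1
inv_perm = torch.zeros(num_blocks, dtype=torch.long)
inv_perm[1:] = torch.argsort(perm) + 1              # logical block id -> new slot

new[1:] = old[perm]                                 # apply the shuffle

# Every logical block id b can still be found at slot inv_perm[b],
# which is exactly what the block table above relies on.
for b in range(1, num_blocks):
    assert torch.equal(new[inv_perm[b]], old[b])
print("inverse permutation maps logical block ids to shuffled slots")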
+ + # Construct the right block table + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size) + start = start_block_idx + end = start + num_blocks_for_seq + block_table[i, :num_blocks_for_seq] = inv_perm[start:end] + start_block_idx += num_blocks_for_seq + + # Create a realistic slot mapping that corresponds to the block table + for i in range(batch_size): + token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i]) + block_indices = token_offsets // block_size + token_inter_block_offsets = token_offsets % block_size + start = common_attn_metadata.query_start_loc_cpu[i] + end = common_attn_metadata.query_start_loc_cpu[i + 1] + slot_mapping[start:end] = block_table[ + i, + block_indices] * block_size + token_inter_block_offsets.to(device) + + return kv_cache + + +class MockAttentionLayer: + """A mock attention layer for testing.""" + + def __init__(self, device: torch.device): + self._q_scale = torch.tensor(1.0, device=device) + self._k_scale = torch.tensor(1.0, device=device) + self._v_scale = torch.tensor(1.0, device=device) + + +def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, + layer_names: list[str], vllm_config, + device: torch.device, + common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, kv_c: torch.Tensor, + k_pe: torch.Tensor, kv_cache: torch.Tensor, + kv_lora_rank: int, qk_nope_head_dim: int, + qk_rope_head_dim: int, v_head_dim: int, + mock_kv_b_proj) -> torch.Tensor: + """Run attention computation using the specified backend's AttentionImpl.""" + + builder_cls, impl_cls = get_attention_backend(backend) + + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) + + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) + + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty(num_tokens, + num_heads * v_head_dim, + dtype=query.dtype, + device=query.device) + + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. 
+ output = impl.forward(mock_layer, + query, + kv_c, + k_pe, + kv_cache, + attn_metadata, + output=output) + + return output + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) +def test_backend_correctness(dist_init, batch_spec_name: str, model: str): + """ + Test that all backends produce similar outputs to a reference implementation + using torch.nn.functional.scaled_dot_product_attention. + + This test works by: + 1. Generating a batch of sequences with specified context and query lengths. + 2. Computing a ground-truth attention output using torch.sdpa on + contiguous Q, K, and V tensors. + 3. Simulating vLLM's paged KV cache: It takes the context portion of the + K/V tensors and manually places them into a paged buffer according to + the test's (randomly generated) block table. + 4. Running each vLLM attention backend with the new queries and the + simulated paged KV cache. + 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + """ + batch_spec = BATCH_SPECS[batch_spec_name] + vllm_config = create_vllm_config(model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=2048) + device = torch.device("cuda:0") + + kv_cache_spec = create_standard_kv_cache_spec(vllm_config) + + # 1. Setup + batch_size = batch_spec.batch_size + seq_lens = batch_spec.seq_lens + query_lens = batch_spec.query_lens + num_q_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + block_size = vllm_config.cache_config.block_size + kv_lora_rank = 512 + qk_rope_head_dim = 64 + qk_nope_head_dim = 128 + v_head_dim = 128 + total_head_size = kv_lora_rank + qk_rope_head_dim + assert kv_lora_rank + qk_rope_head_dim == head_size, \ + f"MLA dimensions don't match: {total_head_size} != {head_size}" + scale = 1.0 / (total_head_size**0.5) + + # 2. 
Generate data and compute SDPA reference output for MLA + all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] + all_sdpa_outputs = [] + kv_c_contexts, k_pe_contexts = [], [] + + # Create shared MLA weight matrices for consistency across all sequences + W_UK = torch.randn(kv_lora_rank, + num_q_heads, + qk_nope_head_dim, + dtype=dtype, + device=device) + W_UV = torch.randn(kv_lora_rank, + num_q_heads, + v_head_dim, + dtype=dtype, + device=device) + kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) + + for i in range(batch_size): + s_len = seq_lens[i] + q_len = query_lens[i] + context_len = s_len - q_len + + # Generate MLA tensors + # Q has both nope and rope components: + # [q_len, num_heads, qk_nope_head_dim + qk_rope_head_dim] + q_c = torch.randn(q_len, + num_q_heads, + qk_nope_head_dim + qk_rope_head_dim, + dtype=dtype, + device=device) + + # KV_C (latent K/V): [s_len, kv_lora_rank] + kv_c_full = torch.randn(s_len, + kv_lora_rank, + dtype=dtype, + device=device) + + # K_PE (rope component): [s_len, 1, qk_rope_head_dim] + k_pe_full = torch.randn(s_len, + 1, + qk_rope_head_dim, + dtype=dtype, + device=device) + + # Determine if this is decode (single token) + # or prefill (multiple tokens) + is_decode = q_len == 1 + + # Split q into nope and rope components + q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) + + if is_decode: + # Decode path: MQA-style attention in latent space + # Transform q_nope to latent space: q_nope @ W_UK + # q_nope: [1, num_heads, qk_nope_head_dim] + # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] + ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, + W_UK) # [1, num_heads, kv_lora_rank] + + # Build MQA attention inputs + # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] + q_mqa = torch.cat([ql_nope, q_pe], dim=-1) + # K: [s_len, kv_lora_rank + qk_rope_head_dim] + # (broadcasted to all heads) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) + # V: [s_len, kv_lora_rank] (broadcasted to all heads) + v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze( + 0) # [1, num_heads, kv_lora_rank] + + # Project back to output space: sdpa_out @ W_UV + sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + else: + # Prefill path: MHA-style attention with full sequence + # Apply kv_b_proj to the full kv_c tensor + kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, + kv_b_proj_weight) + k_nope_full, v_full = kv_nope_full.split( + [qk_nope_head_dim, v_head_dim], dim=-1) + + # Build attention inputs for full sequence + q_mha = torch.cat([q_nope, q_pe], + dim=-1) # [q_len, num_heads, total_dim] + k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) + k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) + + # Create custom attention mask: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their pos + attn_mask = torch.ones(q_len, + s_len, + dtype=torch.bool, + device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, 
device=device)) + attn_mask[:, context_len:] = causal_mask + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + + # Single attention call with custom mask + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + attn_mask=attn_mask, + scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + + all_sdpa_outputs.append(sdpa_out_i) + + # Inputs for vLLM MLA backends are just the new tokens + all_q_vllm.append(q_c) + all_kv_c_vllm.append(kv_c_full[context_len:]) # New kv_c tokens + all_k_pe_vllm.append(k_pe_full[context_len:]) # New k_pe tokens + + # Contextual K/V data used to populate the paged cache (MLA format) + kv_c_contexts.append(kv_c_full[:context_len]) + k_pe_contexts.append(k_pe_full[:context_len]) + + # Concatenate all sequences (no reordering needed) + query_vllm = torch.cat(all_q_vllm, dim=0) + kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) + k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) + sdpa_output = torch.cat(all_sdpa_outputs, dim=0) + + # Create mock kv_b_proj using the same weights as reference implementation + from vllm.model_executor.layers.linear import ColumnParallelLinear + mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank, + output_size=num_q_heads * + (qk_nope_head_dim + v_head_dim), + bias=False).to(device=device, + dtype=dtype) + + # Set the mock weights to match our reference implementation + # Reshape W_UK and W_UV to match the expected kv_b_proj format + # [kv_lora_rank, num_heads, qk_nope_head_dim + v_head_dim] + kv_b_proj_weight = kv_b_proj_weight.view( + kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim)) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + + # Create metadata using original batch spec + common_attn_metadata = create_common_attn_metadata( + batch_spec, vllm_config.cache_config.block_size, device) + + # 3. Simulate Paged KV Cache and a realistic slot_mapping + kv_cache = create_and_prepopulate_kv_cache( + kv_c_contexts=kv_c_contexts, + k_pe_contexts=k_pe_contexts, + block_size=block_size, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + device=device, + num_blocks=vllm_config.cache_config.num_gpu_blocks, + common_attn_metadata=common_attn_metadata, + randomize_blocks=True) + + # 4. 
Run vLLM backends and compare + for backend_name in BACKENDS_TO_TEST: + backend_output = run_attention_backend( + backend_name, kv_cache_spec, ["placeholder"], vllm_config, device, + common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache, + kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim, + mock_kv_b_proj) + + # Check shape and dtype consistency + assert backend_output.shape == sdpa_output.shape, ( + f"[{backend_name}] shape {backend_output.shape} != " + f"SDPA shape {sdpa_output.shape}") + assert backend_output.dtype == sdpa_output.dtype, ( + f"[{backend_name}] dtype {backend_output.dtype} != " + f"SDPA dtype {sdpa_output.dtype}") + + assert torch.isfinite(backend_output).all(), ( + f"[{backend_name}] produced non-finite values") + + # Check numerical similarity + rtol = 1e-2 + atol = 5e-1 + + max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() + max_rel_diff = torch.max( + torch.abs(backend_output - sdpa_output) / + torch.abs(sdpa_output)).item() + all_close = torch.allclose(backend_output, + sdpa_output, + rtol=rtol, + atol=atol) + + assert all_close, ( + f"[{backend_name}] output differs from SDPA baseline. " + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e547e71e0c..6a08cdc56f 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -135,6 +135,12 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", _Backend.XFORMERS_VLLM_V1: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", + _Backend.CUTLASS_MLA: + "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", + _Backend.FLASHMLA_VLLM_V1: + "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", + _Backend.TRITON_MLA_VLLM_V1: + "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } if backend_name not in backend_map: @@ -167,9 +173,11 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", tensor_parallel_size: int = 1, max_model_len: int = 1024, dtype: Union[ModelDType, torch.dtype] = "auto", + num_gpu_blocks: int = 1000, block_size: int = 16, max_num_seqs: int = 256, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, add_mock_model_methods: bool = True) -> VllmConfig: """Create a VllmConfig for testing with reasonable defaults.""" @@ -189,7 +197,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", ) # Set cache blocks for testing # (these may be set during initialization normally) - cache_config.num_gpu_blocks = 1000 + cache_config.num_gpu_blocks = num_gpu_blocks cache_config.num_cpu_blocks = 0 parallel_config = ParallelConfig( @@ -198,6 +206,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", scheduler_config = SchedulerConfig( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, ) device_config = DeviceConfig() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 646e4fec83..03028ebfe7 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -24,7 +24,7 @@ Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. +* Use a single latent vector to represent the per-token entry of the KV cache. * For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. @@ -82,7 +82,7 @@ spda_o = scaled_dot_product_attention( torch.cat([q_nope, q_pe], dim=-1), torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), v -) +) return spda_o @ W_O NOTE: in the actual code, @@ -120,20 +120,20 @@ return o.view(-1, N * V) @ self.num_heads @ W_O ## Chunked Prefill -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to +For chunked prefill we want to use the compute friendly algorithm. We are +assuming sufficiently large Sq / Skv ratio, in the future may want to switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small. However, the compute-friendly approach can potentially run out of memory if Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a +To mitigate this, we chunk the computation of attention with respect to the +current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a fixed workspace size. The chunked prefill approach is as follows: -MCC Max chunk of context to process per iter, computed dynamically, +MCC Max chunk of context to process per iter, computed dynamically, used to bound the memory usage q_c = h_t @ W_DQ @@ -155,7 +155,7 @@ curr_o, curr_lse = scaled_dot_product_attention( new_v, casual=True, return_softmax_lse=True -) +) // Compute attention with the already existing context for chunk_idx in range(cdiv(C, MCC)): From c86af22f31838ee654c856279ac5110ae3fdb2cc Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:04:21 -0700 Subject: [PATCH 448/932] [Fix] remove is_marlin param in benchmark_moe (#23286) From 4b795020eda910ecf16c289a23c4a6c119a4b43b Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:46:06 -0700 Subject: [PATCH 449/932] [EP] Add logging for experts map (#22685) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Simon Mo --- vllm/model_executor/layers/fused_moe/layer.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aa8ceda1bb..b16c21b701 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -695,6 +695,26 @@ def determine_expert_map( return (local_num_experts, expert_map) +def get_compressed_expert_map(expert_map: torch.Tensor) -> str: + """ + Compresses the expert map by removing any -1 entries. + + Args: + expert_map (torch.Tensor): A tensor of shape (global_num_experts,) + mapping from global to local index. Contains -1 for experts not + assigned to the current rank. + + Returns: + str: A string mapping from local to global index. + Using str to support hashing for logging once only. 
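+
+    Example (illustrative):
+        For expert_map = tensor([-1, 0, -1, 1]), experts 1 and 3 are local to
+        this rank, so the returned string is "0->1, 1->3".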
+ """ + global_indices = torch.where(expert_map != -1)[0] + local_indices = expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices)) + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -795,6 +815,12 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", self.ep_rank, self.ep_size, self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self.expert_map)) else: self.local_num_experts, self.expert_map = (self.global_num_experts, None) From f5aa307d7795b8400d3719087c502c2a227030c7 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 20:14:59 -0400 Subject: [PATCH 450/932] Remove duplicate entry in vllm.attention.__all__ (#23296) Signed-off-by: Russell Bryant --- vllm/attention/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 344040586a..dcb2aa68fb 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -14,7 +14,6 @@ __all__ = [ "AttentionMetadata", "AttentionType", "AttentionMetadataBuilder", - "Attention", "AttentionState", "get_attn_backend", ] From bbea1cefdd1a29b53355b1655f5d2ae343921f85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 20:18:12 -0400 Subject: [PATCH 451/932] [CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcf7875894..f3afc015f6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -888,12 +888,6 @@ class EngineArgs: parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') - parser.add_argument('--enable-prompt-adapter', - action='store_true', - deprecated=True, - help='[DEPRECATED] Prompt adapter has been ' - 'removed. 
Setting this flag to True or False' - ' has no effect on vLLM behavior.') return parser From b029de9902aa3ac58806c8c17776c7074175b6db Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 18:25:56 -0700 Subject: [PATCH 452/932] [Optimization] Make new_block_ids None if empty (#23262) Signed-off-by: Woosuk Kwon --- vllm/v1/core/kv_cache_manager.py | 30 ++++++++++++++++++++++++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 24 ++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 14 +++++++++----- vllm/v1/worker/tpu_model_runner.py | 14 +++++++++----- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bfaa7ab08f..fd0bdb2c80 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional +from typing import Literal, Optional, overload from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger @@ -37,7 +37,24 @@ class KVCacheBlocks: tuple(blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks))) - def get_block_ids(self) -> tuple[list[int], ...]: + @overload + def get_block_ids( + self, + allow_none: Literal[False] = False, + ) -> tuple[list[int], ...]: + ... + + @overload + def get_block_ids( + self, + allow_none: Literal[True] = True, + ) -> Optional[tuple[list[int], ...]]: + ... + + def get_block_ids( + self, + allow_none: bool = False, + ): """ Converts the KVCacheBlocks instance to block_ids. @@ -46,6 +63,8 @@ class KVCacheBlocks: * the outer tuple corresponds to KV cache groups * each inner list contains the block_ids of the blocks in that group """ + if allow_none and all(len(group) == 0 for group in self.blocks): + return None return tuple([blk.block_id for blk in group] for group in self.blocks) def get_unhashed_block_ids(self) -> list[int]: @@ -348,10 +367,13 @@ class KVCacheManager: """ return self.block_pool.take_events() + def get_blocks(self, request_id: str) -> KVCacheBlocks: + """Get the blocks of a request.""" + return KVCacheBlocks(self.coordinator.get_blocks(request_id)) + def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" - return KVCacheBlocks( - self.coordinator.get_blocks(request_id)).get_block_ids() + return self.get_blocks(request_id).get_block_ids() def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index fac07f9719..9ba7ec9d96 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -91,7 +91,7 @@ class CachedRequestData: # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. 
new_token_ids: list[list[int]] - new_block_ids: list[tuple[list[int], ...]] + new_block_ids: list[Optional[tuple[list[int], ...]]] num_computed_tokens: list[int] @property diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4b167da5c8..0b528587b9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) -from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) @@ -185,7 +185,7 @@ class Scheduler(SchedulerInterface): # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {} + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -288,8 +288,7 @@ class Scheduler(SchedulerInterface): # Therefore, we might introduce some additional # cycle to fill in the bitmask, which could be a big no-op. structured_output_request_ids[request.request_id] = req_index - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -496,8 +495,8 @@ class Scheduler(SchedulerInterface): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + req_to_new_blocks[request.request_id] = ( + self.kv_cache_manager.get_blocks(request.request_id)) num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING @@ -546,8 +545,8 @@ class Scheduler(SchedulerInterface): ) # Construct the scheduler output. new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) for req in scheduled_new_reqs ] cached_reqs_data = self._make_cached_request_data( @@ -555,7 +554,7 @@ class Scheduler(SchedulerInterface): scheduled_resumed_reqs, num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids, + req_to_new_blocks, ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -628,11 +627,11 @@ class Scheduler(SchedulerInterface): resumed_reqs: list[Request], num_scheduled_tokens: dict[str, int], spec_decode_tokens: dict[str, list[int]], - req_to_new_block_ids: dict[str, tuple[list[int], ...]], + req_to_new_blocks: dict[str, KVCacheBlocks], ) -> CachedRequestData: req_ids: list[str] = [] new_token_ids: list[list[int]] = [] - new_block_ids: list[tuple[list[int], ...]] = [] + new_block_ids: list[Optional[tuple[list[int], ...]]] = [] num_computed_tokens: list[int] = [] use_connector = self.connector is not None @@ -655,7 +654,8 @@ class Scheduler(SchedulerInterface): # out of bounds errors. TODO: Remove this once the KVConnector # is updated to handle token IDs properly. 
new_token_ids.append([]) - new_block_ids.append(req_to_new_block_ids[req_id]) + new_block_ids.append( + req_to_new_blocks[req_id].get_block_ids(allow_none=True)) num_computed_tokens.append(req.num_computed_tokens) # Because resumed_reqs is usually empty, it is more efficient to do # in-place appending so that we don't need to allocate a new list. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33747d6917..cc86f98264 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -574,11 +574,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the block IDs. if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -594,7 +596,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9196c62377..0f569500cd 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -418,11 +418,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the cached states. req_state.num_computed_tokens = num_computed_tokens if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -438,7 +440,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. 
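
The change above relies on typing.overload with Literal so that the default
call to get_block_ids() keeps its non-optional return type while callers that
pass allow_none=True get an Optional one. A minimal standalone sketch of that
typing pattern (names here are illustrative, not vLLM APIs):

    from typing import Literal, Optional, overload

    @overload
    def get_ids(allow_none: Literal[False] = False) -> tuple[list[int], ...]: ...

    @overload
    def get_ids(allow_none: Literal[True]) -> Optional[tuple[list[int], ...]]: ...

    def get_ids(allow_none: bool = False):
        # Pretend these are the per-group block ids; both groups are empty here.
        blocks: tuple[list[int], ...] = ([], [])
        if allow_none and all(len(group) == 0 for group in blocks):
            return None
        return blocks

    assert get_ids() == ([], [])             # default call: always a tuple
    assert get_ids(allow_none=True) is None  # empty groups collapse to None

This mirrors how the scheduler passes None for requests whose block lists did
not grow, letting the GPU/TPU model runners skip the block-table append work
entirely.
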
From 7be5d113d8784536b79f27f24cfa91958dc291b0 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 21 Aug 2025 09:34:24 +0800 Subject: [PATCH 453/932] [CPU] Refactor CPU W8A8 scaled_mm (#23071) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 7 +- cmake/cpu_extension.cmake | 59 +- csrc/cpu/cpu_types_x86.hpp | 8 +- csrc/cpu/dnnl_helper.cpp | 346 +++++++ csrc/cpu/dnnl_helper.h | 169 ++++ csrc/cpu/dnnl_helper.hpp | 206 ---- csrc/cpu/dnnl_kernels.cpp | 494 +++++++++ csrc/cpu/quant.cpp | 951 ------------------ csrc/cpu/torch_bindings.cpp | 92 +- tests/kernels/test_onednn.py | 144 +++ vllm/_custom_ops.py | 83 ++ vllm/model_executor/layers/fused_moe/layer.py | 11 +- vllm/model_executor/layers/linear.py | 8 +- .../kernels/scaled_mm/__init__.py | 4 +- .../quantization/kernels/scaled_mm/cpu.py | 206 ++++ .../quantization/kernels/scaled_mm/cutlass.py | 4 +- vllm/model_executor/layers/utils.py | 6 + 17 files changed, 1525 insertions(+), 1273 deletions(-) create mode 100644 csrc/cpu/dnnl_helper.cpp create mode 100644 csrc/cpu/dnnl_helper.h delete mode 100644 csrc/cpu/dnnl_helper.hpp create mode 100644 csrc/cpu/dnnl_kernels.cpp delete mode 100644 csrc/cpu/quant.cpp create mode 100644 tests/kernels/test_onednn.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 57a7bc4e5f..9dec9f8e9e 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -46,6 +46,11 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -v -s tests/kernels/test_onednn.py" + # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e @@ -99,4 +104,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index e0da46e2ac..cc38cd41a5 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -182,17 +182,17 @@ endif() # # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) # Flag to enable ACL kernels for AARCH64 platforms -if ( VLLM_BUILD_ACL STREQUAL "ON") +if (VLLM_BUILD_ACL STREQUAL "ON") set(USE_ACL ON) else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.8.1 + GIT_TAG v3.9 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -204,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) endif() set(ONEDNN_AARCH64_USE_ACL "ON") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") - endif() + endif() set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_BUILD_DOC "OFF") @@ -217,38 +217,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) -elseif(POWER10_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.7.2 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE + add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") + target_include_directories( + dnnl_ext + PUBLIC ${oneDNN_SOURCE_DIR}/include + PUBLIC ${oneDNN_BINARY_DIR}/include + PRIVATE ${oneDNN_SOURCE_DIR}/src ) - - set(ONEDNN_LIBRARY_TYPE "STATIC") - set(ONEDNN_BUILD_DOC "OFF") - set(ONEDNN_BUILD_EXAMPLES "OFF") - set(ONEDNN_BUILD_TESTS "OFF") - set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") - set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - - set(DNNL_CPU_RUNTIME "OMP") - - FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) + target_link_libraries(dnnl_ext dnnl) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + list(APPEND LIBS dnnl_ext) + set(USE_ONEDNN ON) +else() + set(USE_ONEDNN OFF) endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -275,7 +260,6 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) @@ -289,14 +273,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ${VLLM_EXT_SRC}) add_compile_definitions(-DCPU_CAPABILITY_AVX512) endif() -elseif(POWER10_FOUND) - set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" - ${VLLM_EXT_SRC}) endif() -if (ASIMD_FOUND) + +if(USE_ONEDNN) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" + "csrc/cpu/dnnl_kernels.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 3952c43cbc..982f7c07a1 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const FP32Vec16&); - void save(void* ptr) const { 
*reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 1)) {} void save(void* ptr) const { - *reinterpret_cast<__m256i*>(ptr) = reg_low; - *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + _mm256_storeu_si256((__m256i*)ptr, reg_low); + _mm256_storeu_si256((__m256i*)ptr + 1, reg_high); } }; #endif diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp new file mode 100644 index 0000000000..f3f00edb36 --- /dev/null +++ b/csrc/cpu/dnnl_helper.cpp @@ -0,0 +1,346 @@ +#include +#include + +#include "common/memory_desc.hpp" +#include "common/memory.hpp" + +#include "dnnl_helper.h" + +static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; +} + +static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; +} + +void release_dnnl_matmul_handler(int64_t handler) { + DNNLMatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + delete ptr; +} + +template +class DNNLPrimitiveCache { + public: + using cache_value_t = std::pair; + using result_value_t = VT; + using container_t = std::list; + using value_iterator_t = typename container_t::iterator; + using map_t = std::unordered_map; + using creator_t = VT (*)(); + + public: + DNNLPrimitiveCache(size_t capacity) + : capacity_(capacity), + values_(), + key_to_value_(std::min(256lu, capacity)) { + assert(capacity > 0); + } + + template + result_value_t get_or_create(const KT& key, F&& creator) { + std::optional value = get_value(key); + if (value.has_value()) { + return value.value()->second; + } else { + return add_value({key, creator()})->second; + } + } + + size_t size() const { return values_.size(); } + + private: + void dump_data() { + std::stringstream ss; + ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec + << "\n"; + ss << "container: ["; + for (auto&& iter : values_) { + ss << "(" << iter.first << ", " << std::hex + << reinterpret_cast(iter.second.get()) << "), " << std::dec; + } + ss << "]\n"; + + ss << "map: ["; + for (auto&& iter : key_to_value_) { + ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex + << reinterpret_cast(iter.second->second.get()) << std::dec + << "), "; + } + ss << "]\n"; + std::printf("%s\n", ss.str().c_str()); + } + + value_iterator_t add_value(cache_value_t&& new_value) { + if (size() == capacity_) { + cache_value_t& last_item = values_.back(); + key_to_value_.erase(last_item.first); + values_.pop_back(); + } + + auto& added_value_ = values_.emplace_front(std::move(new_value)); + key_to_value_.emplace(added_value_.first, values_.begin()); + return values_.begin(); + } + + std::optional get_value(const KT& key) { + if (key_to_value_.size() > 0 && key == values_.begin()->first) { + return values_.begin(); + } + + auto value_map_iterator = key_to_value_.find(key); + if (value_map_iterator != key_to_value_.end()) { + values_.splice(values_.begin(), values_, value_map_iterator->second); + return value_map_iterator->second; + } 
else { + return {}; + } + } + + private: + const size_t capacity_; + container_t values_; + map_t key_to_value_; +}; + +DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( + const Args& args, dnnl::memory::data_type b_type) + : b_n_size_(args.b_n_size), + b_n_stride_(args.b_n_stride), + b_k_size_(args.b_k_size), + b_k_stride_(args.b_k_stride), + b_type_(b_type), + c_type_(args.c_type), + runtime_memory_ptrs_(8), + primitive_cache_size_(args.primitive_cache_size) { + assert(primitive_cache_size_ > 0); +} + +void DNNLMatMulPrimitiveHandler::prepack_weight( + void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) { + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); + dnnl::memory packed_weight(b_target_mem_desc, default_engine()); + { + dnnl::reorder(original_weight, packed_weight) + .execute(default_stream(), original_weight, packed_weight); + default_stream().wait(); + } + memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight; + b_target_mem_desc_ = b_target_mem_desc; +} + +void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr( + size_t index, dnnl_memory* memory_ptr) { + dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage(); + dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md()); + runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc}; +} + +std::pair +DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) { + return runtime_memory_ptrs_[index]; +} + +namespace std { +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^ + hash()(static_cast(val.a_qs)) ^ + hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^ + hash()(static_cast(val.c_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; +} // namespace std + +bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size && + l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp && + l.c_type == r.c_type; +} + +bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size && + l.bias_type == r.bias_type; +} + +static std::shared_ptr +get_w8a8_class_primitive_cache( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), + dnnl::memory::data_type::s8), + use_azp_(args.use_a_zero_point), + a_qs_(args.a_quantization_strategy), + b_qs_(args.b_quantization_strategy), + m_size_cache_(nullptr) { + assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL); + assert(b_qs_ != QuantizationStrategy::PER_TOKEN); + if (a_qs_ == QuantizationStrategy::PER_TOKEN) { + assert(!use_azp_); + }; + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, 
+ .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2); + a_scale_storage->set_data_handle((void*)args.a_scales_ptr); + } + if (use_azp_) { + auto&& [a_zero_point_storage, a_zero_point_mem_desc] = + get_runtime_memory_ptr(3); + a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr); + } + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, + .b_k_size = b_k_size_, + .a_qs = a_qs_, + .b_qs = b_qs_, + .use_azp = use_azp_, + .c_type = c_type_}; + m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_); + } + + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + return dnnl::matmul(desc); + }); +} + +void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get()); + if (use_azp_) { + memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get()); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), + (void*)args.b_scales_ptr); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), (void*)args.b_scales_ptr); + } + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); +} + +dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( + const 
MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md({key.a_m_size, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab); + dnnl::memory::desc b_md; + if (first_time) { + b_md = + dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + } else { + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_SRC, 0); + if (use_azp_) { + attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + if (key.use_bias) { + // For PER_TOKEN, bias will be applied in epilogue + assert(a_qs_ == QuantizationStrategy::PER_TENSOR); + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h new file mode 100644 index 0000000000..54ceefced9 --- /dev/null +++ b/csrc/cpu/dnnl_helper.h @@ -0,0 +1,169 @@ +#ifndef DNNL_HELPER_H +#define DNNL_HELPER_H + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace c10 { +struct BFloat16; +struct Half; +} // namespace c10 + +namespace dnnl { +namespace impl { +struct memory_storage_t; +struct matmul_pd_t; +struct matmul_desc_t; +} // namespace impl +} // namespace dnnl +struct dnnl_memory_desc; + +template +class DNNLPrimitiveCache; + +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} + +class DNNLMatMulPrimitiveHandler { + public: + virtual ~DNNLMatMulPrimitiveHandler() = default; + + protected: + struct Args { + dnnl_dim_t b_n_size; + dnnl_dim_t b_n_stride; + dnnl_dim_t b_k_size; + dnnl_dim_t b_k_stride; + void* b_ptr; + dnnl::memory::data_type c_type; + size_t primitive_cache_size; + }; + + protected: + DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); + + void prepack_weight(void* original_b_ptr, + dnnl::memory::desc b_target_mem_desc); + + void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); + + std::pair + get_runtime_memory_ptr(size_t index); + + protected: + const dnnl_dim_t b_n_size_; + const dnnl_dim_t b_n_stride_; + const dnnl_dim_t b_k_size_; + const dnnl_dim_t b_k_stride_; + dnnl::memory::data_type b_type_; + dnnl::memory::data_type c_type_; + std::unordered_map 
memory_cache_; + std::vector> + runtime_memory_ptrs_; + dnnl::memory::desc b_target_mem_desc_; + int64_t primitive_cache_size_; +}; + +class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL }; + + struct Args : public DNNLMatMulPrimitiveHandler::Args { + bool use_a_zero_point; + QuantizationStrategy a_quantization_strategy; + QuantizationStrategy b_quantization_strategy; + float* b_scales_ptr; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + QuantizationStrategy a_qs; + QuantizationStrategy b_qs; + bool use_azp; + dnnl::memory::data_type c_type; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const int8_t* a_ptr; + const float* a_scales_ptr; + const int32_t* a_zero_points_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + W8A8MatMulPrimitiveHandler(const Args& args); + + QuantizationStrategy get_input_scale_strategy() const { return a_qs_; } + + bool get_input_use_zero_point() const { return use_azp_; } + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + const bool use_azp_; + const QuantizationStrategy a_qs_; + const QuantizationStrategy b_qs_; + std::shared_ptr m_size_cache_; +}; + +#endif diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp deleted file mode 100644 index 1cb8dc5b25..0000000000 --- a/csrc/cpu/dnnl_helper.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef DNNL_HELPER_HPP -#define DNNL_HELPER_HPP - -#include -#include - -#include "oneapi/dnnl/dnnl.hpp" - -namespace { -template -struct DNNLType { - static constexpr dnnl::memory::data_type type = - dnnl::memory::data_type::undef; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; -}; - -template -constexpr inline dnnl::memory::data_type get_dnnl_type() { - return DNNLType>::type; -} -}; // namespace - -template -class DNNLPrimitiveHelper { - public: - // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) - // A: [M, K], row-major - // B: [K, N], column-major - // C: [M, N], row-major - // bias: [N], row-major, optional - // a_scales: [MS] - // b_scales: [NS] - // Note: Due to the limitation of oneDNN - // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is - // not supported. 
- - template - static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, - const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, - dnnl_dim_t K, const float* a_scales, - const float* b_scales, dnnl_dim_t MS, - dnnl_dim_t NS) { - auto&& OutputType = get_dnnl_type(); - auto&& BiasType = get_dnnl_type(); - - dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); - dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); - dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); - - dnnl::primitive_attr attr; - if constexpr (!InputNoScale) { - if (MS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_SRC, 0); - } else { - // per-token - TORCH_CHECK(false, "per-token quantization is unsupported."); - } - } - - if (NS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); - } else { - // per-channel - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); - } - - dnnl::matmul::primitive_desc matmul_pd; -// Create memory descriptors with format_tag::any for the primitive. This -// enables the matmul primitive to choose memory layouts for an -// optimized primitive implementation, and these layouts may differ from the -// ones provided by the user. -#ifdef __aarch64__ - auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, - dnnl::memory::format_tag::any); - auto mat_weights_md = dnnl::memory::desc( - {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); - auto mat_dst_md = - dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, - mat_weights_md, bias_md, - mat_dst_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc( - default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); - } -#else - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - bias_md, c_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - c_md, attr); - } -#endif - dnnl::matmul matmul(matmul_pd); - - auto& engine = default_engine(); - - dnnl::memory a_m(a_md, engine, (void*)a); - dnnl::memory b_m(b_md, engine, (void*)b); - dnnl::memory c_m(c_md, engine, (void*)c); - dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)a_scales); - dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)b_scales); - - auto& stream = default_stream(); - - auto mat_src_mem = a_m; - auto mat_weights_mem = b_m; - auto mat_dst_mem = c_m; -#ifdef __aarch64__ - if (matmul_pd.weights_desc() != b_m.get_desc()) { - mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); - dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); - } -#endif - if constexpr (InputNoScale) { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } else { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory 
bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } - stream.wait(); - } - - private: - static dnnl::engine& default_engine() { - static dnnl::engine engine(dnnl::engine::kind::cpu, 0); - return engine; - } - - static dnnl::stream& default_stream() { - static dnnl::stream stream(default_engine()); - return stream; - } -}; -#endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp new file mode 100644 index 0000000000..acc3b9ecde --- /dev/null +++ b/csrc/cpu/dnnl_kernels.cpp @@ -0,0 +1,494 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.h" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; +#endif + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const 
scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = azp_val; + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const int32_t* azp, + const float* azp_adj, const scalar_t* bias, + const int64_t num_tokens, + const int64_t hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + const int64_t thread_num = omp_get_max_threads(); + if (num_tokens > thread_num) { +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + const float* 
input_ptr = input + i * hidden_size; + scalar_t* output_ptr = output + i * hidden_size; + int64_t j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; ++j) { + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j); + } + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j, hidden_size - j); + } + } else { + const int64_t vec_iteration = + (hidden_size + vec_elem_num - 1) / vec_elem_num; + const int64_t vec_iteration_per_thread = + (vec_iteration + thread_num - 1) / thread_num; + const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num; +#pragma omp parallel for schedule(static, 1) + for (int64_t i = 0; i < thread_num; ++i) { + const int64_t start = elem_num_per_thread * i; + const int64_t end = std::min(hidden_size, elem_num_per_thread + start); + for (int64_t j = 0; j < num_tokens; ++j) { + cvt_vec_t token_scale_vec(a_scale[j]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[j] * static_cast(azp[j]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + int64_t k = start; + const float* input_ptr = input + j * hidden_size; + scalar_t* output_ptr = output + j * hidden_size; + for (; k < end - vec_elem_num; k += vec_elem_num) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k); + } + if (k < end) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k, end - k); + } + } + } + } +} +} // namespace + +int64_t create_onednn_scaled_mm_handler( + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& b_scales, // [1] or [OC] + at::ScalarType output_type, bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(b_scales.is_contiguous()); + + W8A8MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + if (b_scales.numel() == 1) { + 
args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + } else { + TORCH_CHECK_EQ(b_scales.numel(), b.size(1)); + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL; + } + args.b_scales_ptr = b_scales.data_ptr(); + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + if (dynamic_act_quant) { + // dynamic per-token, bias, A scales and A zps will be applied in outside. + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN; + args.use_a_zero_point = false; + } else { + // static per-tensor + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + args.use_a_zero_point = use_azp; + } + + VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler", + [&] { + if (dynamic_act_quant) { + args.c_type = get_dnnl_type(); + } else { + args.c_type = get_dnnl_type(); + } + }); + + return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args)); +} + +void onednn_scaled_mm( + torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& a_scales, // [M] or [1] + const std::optional& azp, // [M] or [1] + const std::optional& azp_adj, // [M] or [1] + const std::optional& bias, // [N] + int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_scaled_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.is_contiguous()); + TORCH_CHECK(c.is_contiguous()); + W8A8MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + const int32_t* azp_ptr = nullptr; + if (azp.has_value()) { + azp_ptr = azp->data_ptr(); + } + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + TORCH_CHECK_EQ(a_scales.numel(), 1); + } + + W8A8MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_ptr = a.data_ptr(); + exec_args.a_m_size = a.size(0); + exec_args.bias_ptr = nullptr; + exec_args.use_bias = false; + exec_args.a_scales_ptr = nullptr; + exec_args.a_zero_points_ptr = nullptr; + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] { + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + if (bias.has_value()) { + exec_args.bias_ptr = bias->data_ptr(); + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = true; + } + exec_args.a_scales_ptr = a_scales.data_ptr(); + exec_args.a_zero_points_ptr = azp_ptr; + exec_args.c_ptr = c.data_ptr(); + ptr->execute(exec_args); + } else if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) { + torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + exec_args.c_ptr = tmp_fp32_out.data_ptr(); + ptr->execute(exec_args); + if (bias.has_value()) { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + (scalar_t*)nullptr, c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + 
a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr, + c.size(0), c.size(1)); + } + } + } else { + TORCH_CHECK(false, "invalid act quant type."); + } + }); +} + +// static-per-tensor quantization. +void static_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + const torch::Tensor& scale, std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int64_t stride = input.stride(0); + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + static_scaled_int8_quant_impl(input.data_ptr(), + out.data_ptr(), + scale.data_ptr(), nullptr, + num_tokens, stride, hidden_size); + } + }); +} + +// dynamic-per-token quantization. +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + torch::Tensor& scale, // [batch, 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + const int64_t stride = input.stride(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, stride, + hidden_size); + } + }); +} diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp deleted file mode 100644 index 6e120b8d20..0000000000 --- a/csrc/cpu/quant.cpp +++ /dev/null @@ -1,951 +0,0 @@ -#include "cpu_types.hpp" -#include "dnnl_helper.hpp" - -namespace { -template -struct KernelVecType { - using load_vec_type = void; - using azp_adj_load_vec_type = void; - using cvt_vec_type = void; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::FP32Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) -template <> -struct KernelVecType { - using load_vec_type = vec_op::BF16Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; -#endif - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power architecture-specific vector type - using load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures - using load_vec_type = vec_op::FP16Vec16; -#endif - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if defined(__AVX512F__) || defined(__aarch64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = 
typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size 
+ j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t 
b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#elif defined(__powerpc64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = 
max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void 
dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#else -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " - "support.") -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "dynamic_scaled_int8_quant_impl requires " - "AVX512/powerpc64/AArch64 support.") -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_with_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, - "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} -#endif -} // namespace - -void 
int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Ideally we want to fuse the GEMM and the scale procedure with oneDNN - // JIT, the intermediate data is cached in registers or L1. But for now - // the oneDNN GEMM code generation only supports two quantization - // patterns: per-tensor or per-output-channel of weight. - // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * - // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN - // GEMM, then the per-token scale (and bias) is applied with the epilogue - // C=s_a * C_inter + bias. 
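The deleted comment above spells out why per-token activation scales cannot be fused into the oneDNN GEMM: the GEMM applies only the weight scale, and the per-token scale (plus bias) is applied in a separate epilogue. A minimal PyTorch reference of that two-step decomposition, purely illustrative (fp32 math, no vectorization; the function name is invented for this sketch):

```python
from typing import Optional
import torch

def ref_per_token_scaled_mm(a_q: torch.Tensor,   # [M, K] int8 activations
                            b_q: torch.Tensor,   # [K, N] int8 weights
                            s_a: torch.Tensor,   # [M, 1] per-token activation scales
                            s_b: torch.Tensor,   # [1, N] or scalar weight scales
                            bias: Optional[torch.Tensor]) -> torch.Tensor:
    # Step 1: what the oneDNN GEMM produces, with only the weight scale fused.
    c_inter = (a_q.float() @ b_q.float()) * s_b      # C_inter = s_b * (A @ B)
    # Step 2: the epilogue applies the per-token activation scale and bias.
    c = s_a * c_inter                                # C = s_a * C_inter + bias
    if bias is not None:
        c = c + bias.float()
    return c
```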
- torch::Tensor tmp_fp32_out = - torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - bias->data_ptr(), a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } else { - // Compute C=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - nullptr, a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - } - }); -} - -void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] - const std::optional& azp, // [1] or [M] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_azp only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); - } - if (azp) { - TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); - } - TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); - - // azp & bias types - TORCH_CHECK(azp_adj.dtype() == torch::kInt32); - TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); - TORCH_CHECK(!bias || bias->dtype() == c.dtype(), - "currently bias dtype must match output dtype ", c.dtype()); - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias 
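The formula in the comment above (C = s_a * C_inter - s_a * s_b * azp * azp_adj + bias) follows from A ≈ s_a * (A_q - azp): the zero-point term reduces to a per-output-channel column sum of the int8 weight, the same quantity the test file and the new CPU kernel build as `azp_adj`. A rough reference under those assumptions (per-tensor vs. per-channel weight scales collapsed into broadcasting; not the vectorized epilogue):

```python
from typing import Optional
import torch

def ref_azp_epilogue(c_inter: torch.Tensor,  # [M, N] = s_b * (A_q @ B_q)
                     s_a: torch.Tensor,      # [M, 1] per-token activation scales
                     s_b: torch.Tensor,      # [1, N] or scalar weight scales
                     azp: torch.Tensor,      # [M, 1] int32 activation zero points
                     b_q: torch.Tensor,      # [K, N] int8 weights
                     bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # azp_adj is the per-output-channel column sum of the quantized weight,
    # so s_a * s_b * azp * azp_adj removes the zero-point contribution.
    azp_adj = b_q.float().sum(dim=0, keepdim=True)   # [1, N]
    out = s_a * c_inter - s_a * s_b * azp.float() * azp_adj
    if bias is not None:
        out = out + bias.float()
    return out
```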
- if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } - } else { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C_inter=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), bias->data_ptr(), - a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), - b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); - } else { - // Compute C_inter=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - - // Compute C=C_inter - s_a * s_b * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } else { - // Per-Tensor - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } - } - }); -} - -// static-per-tensor quantization. -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value() || azp->numel() == 1); - - const int hidden_size = input.size(-1); - const int num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -// dynamic-per-token quantization. 
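For the dynamic per-token path referenced by the comment above, the kernels derive a symmetric scale as absmax / 127, or an asymmetric scale/zero-point pair as (max - min) / 255 and round(-128 - min / scale). A simplified PyTorch reference of that logic (rounding and saturation details of the vectorized kernels may differ):

```python
import torch

def ref_dynamic_int8_quant(x: torch.Tensor, symmetric: bool = True):
    x = x.float()
    if symmetric:
        # Per-token symmetric: scale = absmax / 127, no zero point.
        scale = x.abs().amax(dim=-1, keepdim=True) / 127.0
        q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
        return q, scale, None
    # Per-token asymmetric: map [min, max] onto [-128, 127].
    x_min = x.amin(dim=-1, keepdim=True)
    x_max = x.amax(dim=-1, keepdim=True)
    scale = (x_max - x_min) / 255.0
    azp = torch.round(-128.0 - x_min / scale).to(torch.int32)
    q = torch.clamp(torch.round(x / scale + azp), -128, 127).to(torch.int8)
    return q, scale, azp
```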
-void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_ppc64le only supports INT8 inputs."); - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - // We dont need this - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - }); -} - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b20a054648..c9f426bdf6 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -6,25 +6,20 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); -void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); +void release_dnnl_matmul_handler(int64_t handler); -void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& 
azp_adj, - const std::optional& azp, - const std::optional& bias); +int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b, + const torch::Tensor& b_scales, + at::ScalarType output_type, + bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size); -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); -#endif +void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& a_scales, + const std::optional& azp, + const std::optional& azp_adj, + const std::optional& bias, + int64_t handler); void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, @@ -151,8 +146,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // Helper function to release oneDNN handlers + ops.def("release_dnnl_matmul_handler(int handler) -> ()", + &release_dnnl_matmul_handler); + + // Create oneDNN W8A8 handler + ops.def( + "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " + "output_type, bool dynamic_act_quant, bool use_azp, int " + "primitive_cache_size) -> int", + &create_onednn_scaled_mm_handler); + + // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization + ops.def( + "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " + "Tensor? azp_adj, Tensor? bias, int handler) -> ()"); + ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); // Compute int8 quantized tensor for given scaling factor. ops.def( @@ -168,50 +180,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); -#elif defined(__powerpc64__) - // Compute int8 quantized tensor for given scaling factor. - ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); - ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); - - // Compute int8 quantized tensor and scaling factor - ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); - ops.impl("dynamic_scaled_int8_quant", torch::kCPU, - &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? 
bias) -> ()"); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()"); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif // SHM CCL diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py new file mode 100644 index 0000000000..17692384ac --- /dev/null +++ b/tests/kernels/test_onednn.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Integration tests for FlexAttention backend vs default backend""" + +from typing import Optional + +import pytest +import torch + +from tests.kernels.utils import to_int8 +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +NK_FACTORS = [ + (256, 128), + (4096, 4096), + (16384, 4096), + (1023, 491), + (1001, 15), +] +M_FACTORS = [ + (16, 1, 32, 128, 64), + (1, 17, 1, 31, 17), +] +CACHE_SIZES = [2] +DTYPE = [torch.bfloat16] + + +def rand_int8(shape: tuple, device: str = "cpu"): + return to_int8(torch.rand(shape, device=device) * 255 - 128) + + +def ref_int8_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + azp: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + output_type: torch.dtype, +): + if azp is not None: + a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32) + output = torch.mm((scale_a * a.to(dtype=torch.float32)), + (scale_b * b.to(dtype=torch.float32))) + if bias is not None: + output += bias.float() + + return output.to(dtype=output_type) + + +def onednn_int8_gemm_test_helper(primitive_cache_size: int, + m: int, + n: int, + k: int, + per_tensor_a_quant: bool, + per_tensor_b_quant: bool, + use_azp: bool, + use_bias: bool, + out_dtype: torch.dtype = torch.bfloat16, + device: str = "cpu"): + # Test for a oneDNN kernel with per-tensor / per-token activation + # quantization and per-tensor / per-output channel weight quantization. 
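As context for the helper above, a condensed sketch of the call pattern it exercises: the handler is created once from the column-major int8 weight and then reused for each activation batch. Shapes, values, and the local `to_int8_ref` helper are illustrative only, and the ops require a CPU build with the oneDNN kernels registered.

```python
import torch
from vllm import _custom_ops as ops

def to_int8_ref(t: torch.Tensor) -> torch.Tensor:
    # Saturating round-to-int8, used only for this illustration.
    return t.round().clamp(-128, 127).to(torch.int8)

m, n, k = 16, 256, 128
# Column-major [K, N] int8 weight, as required by the handler.
b_q = to_int8_ref(torch.randn((n, k)) * 5).t()
scale_b = torch.rand((1, 1), dtype=torch.float32)  # per-tensor weight scale

# The handler prepacks the weight once and can be reused across M sizes.
handler = ops.create_onednn_scaled_mm(
    b_q,
    scale_b,
    torch.bfloat16,
    True,   # dynamic (per-token) activation quantization
    False,  # no activation zero point
    2,      # primitive_cache_size
)

a_q = to_int8_ref(torch.randn((m, k)) * 5)
scale_a = torch.rand((m, 1), dtype=torch.float32)
out = torch.zeros((m, n), dtype=torch.bfloat16)
ops.onednn_scaled_mm(handler, a_q, out, scale_a, None, None, None)
```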
+ a = to_int8(torch.randn((m, k), device=device) * 5) + b = to_int8(torch.randn((n, k), device=device).t() * 5) + + a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1) + b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n) + + scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) + scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + + if use_azp: + azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5 + azp = (azp / scale_a).round().to(dtype=torch.int32) + azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32) + else: + azp = None + azp_adj = None + + if use_bias: + bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + else: + bias = None + + handler = ops.create_onednn_scaled_mm( + b, + scale_b, + out_dtype, + not per_tensor_a_quant, + use_azp, + primitive_cache_size, + ) + + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + if use_bias: + # To test runtime bias setting + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None, + out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +@pytest.mark.parametrize("n,k", NK_FACTORS) +@pytest.mark.parametrize("m_list", M_FACTORS) +@pytest.mark.parametrize("per_tensor_a_scale", [True, False]) +@pytest.mark.parametrize("per_tensor_b_scale", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("use_azp", [True, False]) +@pytest.mark.parametrize("output_type", DTYPE) +@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES) +def test_onednn_int8_scaled_gemm( + n: int, + k: int, + m_list: tuple[int], + per_tensor_a_scale: bool, + per_tensor_b_scale: bool, + use_bias: bool, + use_azp: bool, + output_type: torch.dtype, + primitive_cache_size: int, +): + for m in m_list: + onednn_int8_gemm_test_helper( + primitive_cache_size=primitive_cache_size, + m=m, + n=n, + k=k, + per_tensor_a_quant=per_tensor_a_scale, + per_tensor_b_quant=per_tensor_b_scale, + use_bias=use_bias, + use_azp=use_azp, + out_dtype=output_type, + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 59f2d7737f..3081aff114 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1827,3 +1827,86 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"): M = mat1.size(0) N = mat2.size(0) return torch.empty((M, N), dtype=out_dtype) + + +class CPUDNNLGEMMHandler: + + def __init__(self) -> None: + self.handler: Optional[int] = None + self.n = -1 + self.k = -1 + + def __del__(self): + if self.handler is not None: + torch.ops._C.release_dnnl_matmul_handler(self.handler) + + +def create_onednn_scaled_mm( + weight: torch.Tensor, # [K, N] + weight_scales: torch.Tensor, + output_type: torch.dtype, + dynamic_quant: bool, + use_azp: bool, + primitive_cache_size: int = 128, +) -> CPUDNNLGEMMHandler: + handler = CPUDNNLGEMMHandler() + handler.k, handler.n = weight.size() + handler.handler = torch.ops._C.create_onednn_scaled_mm_handler( + weight, weight_scales, output_type, dynamic_quant, use_azp, + primitive_cache_size) + return handler + + +def onednn_scaled_int8_quant(input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + azp: Optional[torch.Tensor] = None, + symmetric: bool = True): + 
""" + Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. + + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + token_num = input.numel() // input.shape[-1] + input = input.view((token_num, input.shape[-1])) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == ( + azp + is None), "azp must only be provided for asymmetric quantization." + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. + input_scales = torch.empty((token_num, 1), + device=input.device, + dtype=torch.float32) + input_azp = None if symmetric else torch.empty_like(input_scales, + dtype=torch.int32) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, + input_azp) + return output, input_scales, input_azp + + +def onednn_scaled_mm( + dnnl_handler: CPUDNNLGEMMHandler, + x: torch.Tensor, + output: torch.Tensor, + input_scale: Optional[torch.Tensor], + input_zp: Optional[torch.Tensor], + input_zp_adj: Optional[torch.Tensor], + bias: Optional[torch.Tensor], +) -> torch.Tensor: + torch.ops._C.onednn_scaled_mm(output, x, input_scale, input_zp, + input_zp_adj, bias, dnnl_handler.handler) + + return output diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b16c21b701..fcc6987d26 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -360,10 +360,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): elif current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: from vllm.model_executor.layers.fused_moe import cpu_fused_moe - dtype = layer.w13_weight.dtype + from vllm.model_executor.layers.utils import ( + check_cpu_sgl_kernel) + dtype_w13 = layer.w13_weight.dtype + _, n_w13, k_w13 = layer.w13_weight.size() + dtype_w2 = layer.w2_weight.dtype + _, n_w2, k_w2 = layer.w2_weight.size() if (envs.VLLM_CPU_SGL_KERNEL - and torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16): + and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13) + and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)): packed_w13_weight = torch.ops._C.convert_weight_packed( layer.w13_weight) assert packed_w13_weight.size() == layer.w13_weight.size() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 654e2ec7b2..9b1ab7af0a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,11 +199,10 @@ class UnquantizedLinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: + from vllm.model_executor.layers.utils import check_cpu_sgl_kernel N, K = layer.weight.size() dtype = layer.weight.dtype - if (torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16 and N % 32 == 0 - and K % 32 == 0): + if check_cpu_sgl_kernel(N, K, dtype): 
packed_weight = torch.ops._C.convert_weight_packed( layer.weight) assert packed_weight.size() == layer.weight.size() @@ -215,7 +214,8 @@ class UnquantizedLinearMethod(LinearMethodBase): else: logger.warning( "CPU SGL kernels require Intel AMX support," - " bfloat16 weight, IC and OC are divisible by 32.") + " bf16/fp16/int8 weight, IC and OC are divisible by " + "32 and 16.") layer.use_cpu_sgl = False def apply(self, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 18f5ce04fd..2bc68ab3eb 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -6,6 +6,8 @@ from typing import Optional from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( AiterScaledMMLinearKernel) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 @@ -18,7 +20,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { - PlatformEnum.CPU: [CutlassScaledMMLinearKernel], + PlatformEnum.CPU: [CPUScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py new file mode 100644 index 0000000000..59d2b5bce9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise) +from vllm.model_executor.layers.utils import check_cpu_sgl_kernel +from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum + +from .ScaledMMLinearKernel import (ScaledMMLinearKernel, + ScaledMMLinearLayerConfig) + + +class CPUScaledMMLinearKernel(ScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + if not current_platform.is_cpu(): + return False, "CPUScaledMM requires running on CPU." 
+ + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight = getattr(layer, self.w_q_name) + dtype = weight.dtype + N, K = weight.size() + if (current_platform.get_cpu_architecture() == CpuArchEnum.X86 + and envs.VLLM_CPU_SGL_KERNEL and self.config.input_symmetric + and check_cpu_sgl_kernel(N, K, dtype)): + self.linear_method = self._apply_weights_sgl + self.process_weights_for_sgl(layer) + else: + self.linear_method = self._apply_weights_onednn + self.process_weights_for_onednn(layer) + + def process_weights_for_onednn(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Transpose to [K, N] for convenience + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False)) + + # WEIGHT SCALE + # oneDNN kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. + is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False)) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(scale, requires_grad=False)) + + azp = (int8_traits.min - + range_min / scale).round().to(dtype=torch.int32) + replace_parameter(layer, self.i_zp_name, + torch.nn.Parameter(azp, requires_grad=False)) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # Different from cutlass, oneDNN kernels only need the AZP adjustment + # term for dynamic quantization. And s_b should be folded into the + # term. 
Such as: + # s_a * s_b * [(A - zp_a)B] + bias = + # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias = + # s_a * GEMM_output - s_a * zp_a * adj + bias + if not (self.config.input_symmetric + and self.config.is_static_input_scheme): + weight = getattr(layer, self.w_q_name) + weight_scale = getattr(layer, self.w_s_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32) + azp_adj = azp_adj * weight_scale.squeeze() + setattr(layer, self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False)) + else: + setattr(layer, self.azp_adj_name, None) + + weight = getattr(layer, self.w_q_name) + self.dnnl_handler = ops.create_onednn_scaled_mm( + weight, + getattr(layer, self.w_s_name), + torch.get_default_dtype(), + getattr(layer, self.i_s_name) is None, + not self.config.input_symmetric, + 32, + ) + # weight is prepacked and maintained by the dnnl_handler, + # release the original weight + setattr(layer, self.w_q_name, None) + del weight + + def process_weights_for_sgl(self, layer: torch.nn.Module) -> None: + # WEIGHT + weight = getattr(layer, self.w_q_name) + packed_weight = torch.ops._C.convert_weight_packed(weight) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(packed_weight, requires_grad=False)) + + if layer.bias is not None: + bias = layer.bias + layer.register_parameter( + "bias_fp32", + torch.nn.Parameter(bias.float().data, requires_grad=False)) + + # WEIGHT SCALE + # CPU SGL kernels only support per-channel. + # For per-tensor quant, convert to the per-channel case. + weight_scale = getattr(layer, self.w_s_name) + if not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + setattr(layer, self.azp_adj_name, None) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return self.linear_method( + layer, + x, + bias, + ) + + def _apply_weights_onednn( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. 
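        # x_q keeps x's shape in int8. For dynamic quant (i_s is None), x_s is
        # a per-token (num_tokens, 1) float32 tensor; for static quant it is
        # the provided i_s. x_zp is None whenever input_symmetric is set (see
        # the quant helper above).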
+ x_q, x_s, x_zp = ops.onednn_scaled_int8_quant( + x, i_s, i_zp, self.config.input_symmetric) + + m = x.size(0) + n = self.dnnl_handler.n + out = torch.empty((m, n), dtype=x.dtype) + ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, + bias) + + return out + + def _apply_weights_sgl( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + return torch.ops._C.int8_scaled_mm_with_quant( + x, + w_q, + w_s, + layer.bias_fp32 if bias is not None else None, + x.dtype, + True, + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 6ddd4a9ec4..2f982f96b0 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -25,8 +25,8 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): def can_implement( cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: - if (not current_platform.is_cuda() and not current_platform.is_cpu()): - return False, "CutlassScaledMM requires running on CUDA or CPU." + if not current_platform.is_cuda(): + return False, "CutlassScaledMM requires running on CUDA." return True, None diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 48a347a8f5..2897f75b31 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -142,6 +142,12 @@ direct_register_custom_op( ) +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype): + return (torch._C._cpu._is_amx_tile_supported() + and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 + and n % 16 == 0) + + def cpu_unquantized_gemm(layer: torch.nn.Module, x: torch.Tensor, weight: torch.Tensor, From 2461d9e562e5852555c76e0dbed06979f9c6c688 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 11:05:20 +0800 Subject: [PATCH 454/932] [CI/Build] Split out mm processor tests (#23260) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 15 +++++++++++---- .../{ => processing}/test_tensor_schema.py | 7 +++---- vllm/model_executor/models/cohere2_vision.py | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) rename tests/models/multimodal/{ => processing}/test_tensor_schema.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7454206640..5869ae21d5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -545,6 +545,15 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' +- label: Multi-Modal Processor Test + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] torch_nightly: true @@ -554,9 +563,7 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model - - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # 
Needs mp_method="spawn" + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 @@ -567,7 +574,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - label: Multi-Modal Models Test (Extended) 2 mirror_hardwares: [amdexperimental] diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py similarity index 98% rename from tests/models/multimodal/test_tensor_schema.py rename to tests/models/multimodal/processing/test_tensor_schema.py index 143b4c8fc8..79164f02c3 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -24,9 +24,9 @@ from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore -from ...conftest import VllmRunner -from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS -from ..utils import dummy_hf_overrides +from ....conftest import VllmRunner +from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -147,7 +147,6 @@ def get_model_id_to_test( return filtered_results -@pytest.mark.core_model @pytest.mark.parametrize( "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index fca1aee835..179cc2af8e 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -170,6 +170,8 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): # The current implementation of get_number_of_image_patches # is incorrect, so we patch it here. + # TODO: Revert once + # https://github.com/huggingface/transformers/pull/40312 is released. 
# return image_processor.get_number_of_image_patches(image_height, # image_width, {}) From 3663870c72da246d81d8bd8f5c059890fb3f3f5d Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 21 Aug 2025 06:08:51 +0300 Subject: [PATCH 455/932] [V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035) Signed-off-by: asafg Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg --- docs/usage/v1_guide.md | 2 +- .../models/language/generation/test_hybrid.py | 20 ++---- vllm/config/compilation.py | 1 + .../layers/mamba/mamba_mixer.py | 66 ++++++++++++++++--- vllm/model_executor/models/jamba.py | 8 ++- vllm/model_executor/models/mamba.py | 7 +- vllm/v1/attention/backends/mamba1_attn.py | 37 +++++------ vllm/v1/attention/backends/mamba2_attn.py | 45 ++----------- vllm/v1/attention/backends/mamba_attn.py | 55 ++++++++++++++++ 9 files changed, 154 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba_attn.py diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 54af970ea8..9bf0c5842c 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,7 +107,7 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). 
Please note that diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index aee0a50336..f8c0eaa8cf 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -54,16 +54,14 @@ V1_SUPPORTED_MODELS = [ "tiiuae/Falcon-H1-0.5B-Base", ] +FULL_CUDA_GRAPH_MODELS = [ + "ai21labs/Jamba-tiny-dev", + "Zyphra/Zamba2-1.2B-instruct", +] + # Avoid OOM MAX_NUM_SEQS = 4 -# Once we add support for FCG in Mamba1, this list will be removed and tests -# all test cases will use enforce_eager=False -ENFORCE_EAGER_MODELS_V1 = [ - "state-spaces/mamba-130m-hf", - "ai21labs/Jamba-tiny-dev", -] - @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @@ -101,19 +99,13 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: - enforce_eager = False with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - - if model in ENFORCE_EAGER_MODELS_V1: - enforce_eager = True - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - enforce_eager=enforce_eager, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) @@ -373,7 +365,7 @@ def test_distributed_correctness( ) -@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_full_cuda_graph( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56a2183f8e..c654485f4f 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -336,6 +336,7 @@ class CompilationConfig: "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2", + "vllm.mamba_mixer", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 3c7322260d..a24e72778b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -27,6 +27,8 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata @@ -183,22 +185,26 @@ class MambaMixer(MambaBase, CustomOp): def forward(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): if not envs.VLLM_USE_V1: - return CustomOp.forward(self, hidden_states, mamba_cache_params) + CustomOp.forward(self, hidden_states, output, mamba_cache_params) else: - return self.forward_cuda( + torch.ops.vllm.mamba_mixer( hidden_states, - mamba_cache_params, + output, + self.prefix, ) def forward_native(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): pass def forward_cuda(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): """ Run the Mamba-1 SSM pipeline. 
@@ -237,6 +243,7 @@ class MambaMixer(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] has_initial_states = mamba1_metadata.has_initial_states + num_padded_decodes = mamba1_metadata.num_padded_decodes else: assert isinstance(attn_metadata, AttentionMetadata) assert mamba_cache_params is not None @@ -248,6 +255,7 @@ class MambaMixer(MambaBase, CustomOp): has_initial_states = None if context_lens_tensor is not None: has_initial_states = context_lens_tensor > 0 + num_padded_decodes = attn_metadata.num_decode_tokens # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -267,6 +275,7 @@ class MambaMixer(MambaBase, CustomOp): num_decodes = attn_metadata.num_decode_tokens # token count (=request) has_prefill = num_prefill_tokens > 0 has_decode = num_decode_tokens > 0 + num_actual_tokens = num_prefill_tokens + num_decode_tokens prefill_decode_split = split_batch_to_prefill_and_decode( hidden_states_BC, @@ -278,6 +287,7 @@ class MambaMixer(MambaBase, CustomOp): num_decode_tokens, num_prefills, num_decodes, + num_padded_decodes, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d @@ -371,7 +381,7 @@ class MambaMixer(MambaBase, CustomOp): else: out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] - return out + output[:num_actual_tokens] = out def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None @@ -421,18 +431,27 @@ def split_batch_to_prefill_and_decode( num_decode_tokens: int, num_prefills: int, num_decodes: int, + num_padded_decodes: int, ) -> PrefillDecodeSplit: + num_actual_tokens = num_prefill_tokens + num_padded_decodes + if envs.VLLM_USE_V1: # In v1, decode tokens come first, then prefill tokens. 
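        # Example: in a pure-decode batch of 3 requests padded to a CUDA graph
        # capture size of 4, num_padded_decodes == 4 and num_prefill_tokens
        # == 0, so the first 4 positions (3 real decodes plus 1 padding slot
        # whose state index is PAD_SLOT_ID) form the decode group. In mixed
        # prefill/decode batches no padding is applied and
        # num_padded_decodes == num_decodes.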
hidden_states_BC_d, hidden_states_BC_p = torch.split( - hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1) - gate_d, gate_p = torch.split(gate, - [num_decode_tokens, num_prefill_tokens], + hidden_states_BC[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) + gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], dim=-1) + + # num_padded_decodes accounts for CUDA graph padding when applicable state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, [num_decodes, num_prefills], dim=0) + state_indices_tensor[:num_padded_decodes + num_prefills], + [num_padded_decodes, num_prefills], + dim=0) query_start_loc_p = (query_start_loc[-num_prefills - 1:] - - num_decodes if num_prefills > 0 else None) + num_padded_decodes if num_prefills > 0 else None) has_initial_states_p = has_initial_states[-num_prefills:] if ( has_initial_states is not None and num_prefills > 0) else None else: @@ -459,3 +478,32 @@ def split_batch_to_prefill_and_decode( query_start_loc_p=query_start_loc_p, has_initial_states_p=has_initial_states_p, ) + + +def mamba_mixer( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None) + + +def mamba_mixer_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer", + op_func=mamba_mixer, + mutates_args=["output"], + fake_impl=mamba_mixer_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 0b32d6f256..3c1a0b68df 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -10,6 +10,7 @@ from transformers import JambaConfig from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -154,10 +155,10 @@ class JambaMambaDecoderLayer(nn.Module): hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.mamba(hidden_states, mamba_cache_params) + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params) # Fully Connected - hidden_states, residual = self.pre_ff_layernorm( - hidden_states, residual) + hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) return hidden_states, residual @@ -278,6 +279,7 @@ ALL_DECODER_LAYER_TYPES = { } +@support_torch_compile class JambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index f4aaf0c6f4..f02499a4f9 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -9,6 +9,7 @@ from torch import nn from transformers import MambaConfig from vllm import envs +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from 
vllm.model_executor.layers.layernorm import RMSNorm @@ -81,10 +82,12 @@ class MambaDecoderLayer(nn.Module): else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params) + return output, residual +@support_torch_compile class MambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 6cdc509083..97a1aa86dd 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -2,16 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -31,24 +31,11 @@ class Mamba1AttentionMetadata: num_prefill_tokens: int num_decodes: int num_decode_tokens: int + num_padded_decodes: int class Mamba1AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba1AttentionMetadata]): - reorder_batch_threshold: ClassVar[int] = 1 - - def __init__( - self, - kv_cache_spec: AttentionSpec, - vllm_config: VllmConfig, - device: torch.device, - layer_names: list[str], - ): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec - self.device = device - self.vllm_config = vllm_config - self.layer_names = layer_names + BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]): def build( self, @@ -67,9 +54,18 @@ class Mamba1AttentionMetadataBuilder( decode_threshold=1)) has_initial_states = None + padded_decodes = num_decodes if num_prefills > 0: has_initial_states = context_lens_tensor > 0 + elif (num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs + and self.compilation_config.full_cuda_graph): + state_indices_for_decode = state_indices_tensor[:num_decodes] + padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_( + state_indices_for_decode, non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:padded_decodes] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID return Mamba1AttentionMetadata( query_start_loc=query_start_loc, @@ -80,4 +76,5 @@ class Mamba1AttentionMetadataBuilder( num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, + num_padded_decodes=padded_decodes, ) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index ace078e2b2..ed30884fdb 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -2,18 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from 
vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import AttentionSpec def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, @@ -88,29 +88,14 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba2AttentionMetadata]): - cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE - - reorder_batch_threshold: ClassVar[int] = 1 + BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") - self.decode_cudagraph_max_bs = min( - self.vllm_config.scheduler_config.max_num_seqs, - self.compilation_config.max_capture_size) - self.state_indices_tensor = torch.empty( - (self.decode_cudagraph_max_bs, ), - dtype=torch.int32, - device=device, - ) def build(self, common_prefix_len: int, @@ -187,19 +172,3 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor=state_indices_tensor, ) return attn_metadata - - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with Mamba. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "Mamba only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." 
- - m.max_query_len = 1 # decode-only - - return self.build(0, m) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py new file mode 100644 index 0000000000..07ef7cb69a --- /dev/null +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import abc +from typing import ClassVar, TypeVar + +import torch + +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + +M = TypeVar("M") + + +class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + reorder_batch_threshold: ClassVar[int] = 1 + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + self.compilation_config = vllm_config.compilation_config + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + m.max_query_len = 1 # decode-only + + return self.build(0, m) \ No newline at end of file From f94bf9b924afe2e720b864590c9798b911e77e66 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 20 Aug 2025 23:09:39 -0400 Subject: [PATCH 456/932] [Compile] Fix Compile Warning SM100 Cutlass MLA (#23287) Signed-off-by: yewentao256 --- csrc/attention/mla/sm100_cutlass_mla_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index e0e95d0629..6dd6f269f3 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -167,7 +167,7 @@ typename T::Fmha::Arguments args_from_options( // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will // perform worse with larger context length and smaller batch sizes. - num_kv_splits, // split_kv + static_cast(num_kv_splits), // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -264,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba // Assumes device 0 when getting sm_count. arguments.hw_info.sm_count = sm_count <= 0 ? 
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; - arguments.split_kv = num_kv_splits; + arguments.split_kv = static_cast(num_kv_splits); MlaSm100Type::Fmha::set_split_kv(arguments); return MlaSm100Type::Fmha::get_workspace_size(arguments); From 655a09f6538e6b09af23771dcc4fcebd72a15b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=A5=87=28yann=20qi=29?= <51905299+yannqi@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:08:52 +0800 Subject: [PATCH 457/932] [Model][VLM] Support R-4B Model (#23246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yannqi Signed-off-by: 杨奇(yann qi) <51905299+yannqi@users.noreply.github.com> Signed-off-by: Cyrus Leung Co-authored-by: yannqiyang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 23 ++++ .../vision_language_multi_image.py | 34 ++++++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/rvl.py | 103 ++++++++++++++++++ 7 files changed, 165 insertions(+) create mode 100644 vllm/model_executor/models/rvl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7308d00106..831bfb1e93 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -652,6 +652,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 88bbbfdfbd..e7a7a30dd3 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1436,6 +1436,28 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# R-4B +def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "YannQi/R-4B" + + prompts = [ + f"<|im_start|>user \n{question}<|im_end|><|im_start|>assistant\n" + for question in questions + ] + + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + limit_mm_per_prompt={modality: 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1622,6 +1644,7 @@ model_example_map = { "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_omni": run_qwen2_5_omni, + "rvl": run_r_vl, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, "step3": run_step3, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index eabd9453f3..d9242efa85 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -992,6 +992,39 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "YannQi/R-4B" + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" @@ -1193,6 +1226,7 @@ model_example_map = { "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "rvl": load_r_vl, "smolvlm": load_smolvlm, "step3": load_step3, "tarsier": load_tarsier, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 02aecfad82..adc8b2510d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -316,6 +316,7 @@ def _test_processing_correctness_one( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", + 
"YannQi/R-4B", "Skywork/Skywork-R1V-38B", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "stepfun-ai/step3", diff --git a/tests/models/registry.py b/tests/models/registry.py index 6e6acfb8cd..4f69f90b6a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -489,6 +489,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 + "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", + trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 78ef270598..39a3e425a4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = { "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), + "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py new file mode 100644 index 0000000000..efdb010046 --- /dev/null +++ b/vllm/model_executor/models/rvl.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping + +import torch +import torch.nn as nn +from transformers.activations import GELUActivation + +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict + +from .llava_next import (LlavaDummyInputsBuilder, LlavaNextMultiModalProcessor, + LlavaNextProcessingInfo) +from .llava_onevision import LlavaOnevisionForConditionalGeneration +from .utils import WeightsMapper + + +class RVLProcessingInfo(LlavaNextProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + +class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = "" + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = ( + self.info.get_image_size_with_most_features()) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class RVLMultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, + eps=1e-06) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + self.act = GELUActivation() + 
self.linear_2 = nn.Linear( + config.text_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + def forward(self, image_feature: torch.Tensor) -> torch.Tensor: + image_feature = self.pre_norm(image_feature) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextMultiModalProcessor, + info=RVLProcessingInfo, + dummy_inputs=RVLDummyInputsBuilder, +) +class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers + # v4.52 + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "model.image_newline": "image_newline", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config + self.multi_modal_projector = RVLMultiModalProjector(config) From 8993073dc1a7e2d31eda85812b76789046ae7c28 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Thu, 21 Aug 2025 04:15:20 +0000 Subject: [PATCH 458/932] [CI] Delete images older than 24h. (#23291) Signed-off-by: Qiliang Cui --- .buildkite/scripts/tpu/cleanup_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh index 209d9c4341..740d81fb39 100755 --- a/.buildkite/scripts/tpu/cleanup_docker.sh +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune --force --filter "until=72h" --all + docker volume prune -f && docker system prune --force --filter "until=24h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." 
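As a quick end-to-end check of the R-4B support added above, the sketch below runs one image prompt through the offline API. Only the model ID `YannQi/R-4B` and the engine limits come from the patch; the chat-message layout and the bundled sample image are illustrative assumptions, and the `run_r_vl()`/`load_r_vl()` examples in `examples/offline_inference/` remain the reference for prompt construction.

from transformers import AutoProcessor

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

model_id = "YannQi/R-4B"

# Build the prompt through the model's own chat template so the image
# placeholder token lands where the processor expects it. The message layout
# here is an assumption; load_r_vl() above shows the canonical form.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
prompt = processor.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Engine limits mirror the run_r_vl() example added in this series.
llm = LLM(model=model_id,
          trust_remote_code=True,
          max_model_len=16384,
          limit_mm_per_prompt={"image": 1})

outputs = llm.generate(
    {
        "prompt": prompt,
        # Any PIL image works; the bundled asset is only a stand-in.
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)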
From f64ee61d9e7014a5f230a8347186b952dbe483de Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Aug 2025 00:21:05 -0400 Subject: [PATCH 459/932] [CI] Block the cu126 wheel build while broken (#23285) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index e20ce54ca7..f96c38bf57 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -27,7 +27,12 @@ steps: env: DOCKER_BUILDKIT: "1" + - block: "Build CUDA 12.6 wheel" + key: block-build-cu126-wheel + depends_on: ~ + - label: "Build wheel - CUDA 12.6" + depends_on: block-build-cu126-wheel id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge From f571ff8eb6d9117c6a418f7f925921968dff8ac8 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:28:32 -0700 Subject: [PATCH 460/932] [Sampler] Support returning final logprobs (#22387) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Woosuk Kwon --- docs/usage/v1_guide.md | 7 ++- tests/v1/sample/test_logprobs.py | 10 ++-- vllm/config/__init__.py | 30 ++++++---- vllm/engine/arg_utils.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 65 ++++++++++---------- vllm/v1/sample/sampler.py | 79 +++++++++++++++++++------ vllm/v1/sample/tpu/sampler.py | 2 +- 7 files changed, 125 insertions(+), 69 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 9bf0c5842c..b897689136 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -154,12 +154,15 @@ differences compared to V0: ##### Logprobs Calculation -Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. +By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. before applying any logits post-processing such as temperature scaling or penalty adjustments). As a result, the returned logprobs do not reflect the final adjusted probabilities used during sampling. -Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. +You can adjust this behavior by setting the `--logprobs-mode` flag. +Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`. +Raw means the values before applying any logit processors, like bad words. +Processed means the values after applying all processors, including temperature and top_k/top_p. ##### Prompt Logprobs with Prefix Caching diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 8bd142e87b..e835c02963 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -456,9 +456,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(logprob) == vocab_size -@pytest.mark.parametrize( - "logprobs_mode", - ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) +@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
@@ -487,12 +485,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if "logprobs" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, + LogprobsMode.PROCESSED_LOGPROBS): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if "logits" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGITS, + LogprobsMode.PROCESSED_LOGITS): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 959f111ced..2973cb92d1 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -257,11 +257,16 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", - "processed_logits"] MMEncoderTPMode = Literal["weights", "data"] +class LogprobsMode(enum.Enum): + RAW_LOGITS = "raw_logits" + RAW_LOGPROBS = "raw_logprobs" + PROCESSED_LOGITS = "processed_logits" + PROCESSED_LOGPROBS = "processed_logprobs" + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: @@ -363,12 +368,13 @@ class ModelConfig: specified in `SamplingParams`. The default value comes the default for the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = "raw_logprobs" + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS """Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. - Raw means the values before applying logit processors, like bad words. - Processed means the values after applying such processors. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. """ disable_sliding_window: bool = False """Whether to disable sliding window. If True, we will disable the sliding @@ -2586,7 +2592,7 @@ class MultiModalConfig: skip_mm_profiling: bool = False """ - When enabled, skips multimodal memory profiling and only profiles with + When enabled, skips multimodal memory profiling and only profiles with language backbone model during engine initialization. This reduces engine startup time but shifts the responsibility to users for @@ -2649,24 +2655,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. 
""" step_tag_id: Optional[int] = None """ @@ -2692,9 +2698,9 @@ class PoolerConfig: max_embed_len: Optional[int] = None """ - Maximum input length allowed for embedding generation. When set, allows + Maximum input length allowed for embedding generation. When set, allows inputs longer than max_embed_len to be accepted for embedding models. - This parameter enables accepting long inputs without requiring + This parameter enables accepting long inputs without requiring VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds max_embed_len, it will be handled according to the original max_model_len validation logic. Defaults to None (i.e. set to max_model_len). diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f3afc015f6..b0f50b4429 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -516,6 +516,7 @@ class EngineArgs: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", + choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index e0434c8f3d..7bd4a5a380 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -8,6 +8,7 @@ import torch.nn as nn from packaging import version from vllm import envs +from vllm.config import LogprobsMode from vllm.logger import init_logger from vllm.platforms import current_platform @@ -28,9 +29,16 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__(self): + def __init__( + self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: super().__init__() - if current_platform.is_cuda(): + self.logprobs_mode = logprobs_mode + # flashinfer optimization does not apply if intermediate + # logprobs/logits after top_k/top_p need to be returned + if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, + LogprobsMode.PROCESSED_LOGPROBS + ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ if version.parse(flashinfer_version) < version.parse("0.2.3"): @@ -63,10 +71,12 @@ class TopKTopPSampler(nn.Module): "native implementation of top-p & top-k sampling. For the " "best performance, please install FlashInfer.") self.forward = self.forward_native - elif current_platform.is_tpu(): - self.forward = self.forward_tpu else: self.forward = self.forward_native + if current_platform.is_tpu(): + self.apply_top_k_top_p = apply_top_k_top_p_tpu + else: + self.apply_top_k_top_p = apply_top_k_top_p def forward_native( self, @@ -74,15 +84,20 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """ PyTorch-native implementation of top-k and top-p sampling. The logits tensor may be updated in-place. 
""" - logits = apply_top_k_top_p(logits, k, p) + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return random_sample(probs, generators), logits_to_return def forward_cuda( self, @@ -90,34 +105,24 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """More optimized implementation for top-k and top-p sampling.""" - if k is None and p is None: - # We prefer `random_sample` over `flashinfer_sample` when sorting is - # not needed. This is because `random_sample` does not require - # CPU-GPU synchronization while `flashinfer_sample` does. - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) - if generators: - logger.warning_once("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") + # We prefer `random_sample` over `flashinfer_sample` when sorting is + # not needed. This is because `random_sample` does not require + # CPU-GPU synchronization while `flashinfer_sample` does. + if (k is None and p is None) or generators: + if generators: + logger.warning_once("FlashInfer 0.2.3+ does not support " + "per-request generators. Falling back to " + "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # because of slicing operation in logits_processor. - return flashinfer_sample(logits.contiguous(), k, p, generators) - - def forward_tpu( - self, - logits: torch.Tensor, - generators: dict[int, torch.Generator], - k: Optional[torch.Tensor], - p: Optional[torch.Tensor], - ) -> torch.Tensor: - logits = apply_top_k_top_p_tpu(logits, k, p) - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return flashinfer_sample(logits.contiguous(), k, p, generators), None def apply_top_k_top_p_tpu( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 82f51298f1..70ec8a0c26 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" +from typing import Optional + import torch import torch.nn as nn @@ -18,10 +20,50 @@ _SAMPLING_EPS = 1e-5 class Sampler(nn.Module): + """ + A layer that samples the next tokens from the model's outputs + with the following steps in order: - def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): + 1. If logprobs are requested: + a) If `logprobs_mode` is `raw_logprobs`, compute logprobs + as the final logprobs to return. + b) If `logprobs_mode` is `raw_logits`, clone the logits + as the final logprobs to return. + 2. Convert logits to float32. + 3. Apply allowed token ids whitelist. + 4. Apply bad words exclusion. + 5. 
Apply logit processors which are not argmax-invariant, + i.e. that can impact greedy sampling. + a) Min tokens processor + b) Logit bias processor + 6. Apply penalties + a) Repetition penalty + b) Frequency penalty + c) Presence penalty + 7. Sample the next tokens. `sample` method performs the following steps: + a) If not `all_random`, perform greedy sampling. If `all_greedy`, + return the greedily sampled tokens and final logprobs if requested. + b) Apply temperature. + c) Apply logit processors which are argmax-invariant, by default + the min_p processor. + d) Apply top_k and/or top_p. + e) Sample the next tokens with the probability distribution. + f) If `all_random` or temperature >= epsilon (1e-5), return the + randomly sampled tokens and final logprobs if requested. Else, + return the greedily sampled tokens and logprobs if requested. + 8. Gather the logprobs of the top `max_num_logprobs` and sampled token + (if requested). Note that if the sampled token is within the top + `max_num_logprobs`, the logprob will be eventually merged in + `LogprobsProcessor` during output processing. Therefore, the + final output may contain either `max_num_logprobs + 1` or + `max_num_logprobs` logprobs. + 9. Return the final `SamplerOutput`. + """ + + def __init__(self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): super().__init__() - self.topk_topp_sampler = TopKTopPSampler() + self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() self.logprobs_mode = logprobs_mode @@ -34,13 +76,11 @@ class Sampler(nn.Module): # temperature scaling) for the top-k logprobs. # This is different from the V0 sampler, which uses the logits that # is used for sampling (after penalties and temperature scaling). - # TODO(rob): provide option for logprobs post sampling. - # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == "raw_logprobs": + if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "raw_logits": + elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: raw_logprobs = logits.clone() # Use float32 for the logits. @@ -57,15 +97,10 @@ class Sampler(nn.Module): # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) - # Get the process logprobs or logits. - if num_logprobs is not None: - if self.logprobs_mode == "processed_logprobs": - raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "processed_logits": - raw_logprobs = logits.clone() - # Sample the next token. - sampled = self.sample(logits, sampling_metadata) + sampled, processed_logprobs = self.sample(logits, sampling_metadata) + if processed_logprobs is not None: + raw_logprobs = processed_logprobs # Convert sampled token ids to int64 (long) type to ensure compatibility # with subsequent operations that may use these values as indices. # This conversion is necessary because FlashInfer sampling operations @@ -105,7 +140,7 @@ class Sampler(nn.Module): self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Sample logits based on sampling metadata. 
The various logits processing functions called in this method @@ -119,7 +154,13 @@ class Sampler(nn.Module): else: greedy_sampled = self.greedy_sample(logits) if sampling_metadata.all_greedy: - return greedy_sampled + processed_logprobs = None + if sampling_metadata.max_num_logprobs is not None: + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + processed_logprobs = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + processed_logprobs = self.compute_logprobs(logits) + return greedy_sampled, processed_logprobs assert sampling_metadata.temperature is not None @@ -132,7 +173,7 @@ class Sampler(nn.Module): logits = processor.apply(logits) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, processed_logprobs = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, @@ -140,7 +181,7 @@ class Sampler(nn.Module): ) if greedy_sampled is None: - return random_sampled + return random_sampled, processed_logprobs sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, @@ -148,7 +189,7 @@ class Sampler(nn.Module): random_sampled, out=greedy_sampled, # Reuse tensor ) - return sampled + return sampled, processed_logprobs def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 2c9f4892bc..04545d587e 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -65,7 +65,7 @@ class Sampler(nn.Module): logits = self.apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, _ = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, From 0c31e28e9520d96c451cc7f023fd0f0af549766a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 13:03:00 +0800 Subject: [PATCH 461/932] [Bugfix] Fix extra whitespace in strings caused by newline (#23272) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_dataset.py | 6 ++++-- examples/offline_inference/vision_language.py | 15 +++++++-------- vllm/benchmarks/datasets.py | 6 ++++-- vllm/model_executor/model_loader/tpu.py | 11 ++++++----- vllm/model_executor/models/hyperclovax_vision.py | 9 ++++----- vllm/model_executor/models/phi4mm.py | 6 +++--- vllm/transformers_utils/configs/eagle.py | 4 ++-- 7 files changed, 30 insertions(+), 27 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index e1a856026c..2ea4f9ccaf 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -958,8 +958,10 @@ class InstructCoderDataset(HuggingFaceDataset): for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." 
+ ) # apply template prompt = tokenizer.apply_chat_template( diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e7a7a30dd3..8d97ba2668 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -283,8 +283,10 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" + ( + "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>" + f"{question}<|assistant|>" + ) for question in questions ] @@ -767,15 +769,13 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ - f"<|im_start|>user

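The whitespace fix in #23272 above works because a backslash line continuation inside a string literal keeps the next line's indentation as part of the string, while adjacent string literals are concatenated without adding any whitespace. A minimal, self-contained sketch of the difference (the helper names and the sample instruction are illustrative, not taken from the patch):

def prompt_with_continuation(instruction: str) -> str:
    # The indentation after the trailing backslash becomes literal spaces
    # inside the resulting string.
    return f"{instruction} Just output \
        the code, do not include any explanation."


def prompt_with_concatenation(instruction: str) -> str:
    # Adjacent string literals are joined at compile time, so no stray
    # whitespace leaks into the prompt.
    return (f"{instruction} Just output "
            "the code, do not include any explanation.")


if __name__ == "__main__":
    bad = prompt_with_continuation("Refactor this function.")
    good = prompt_with_concatenation("Refactor this function.")
    assert "  " in bad        # leaked indentation shows up as runs of spaces
    assert "  " not in good   # single spaces only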
z?uuZnHz{*>(a=kr^!DPjde86*v>Qv$@gx{CV_jX`L83kTP-m3HV_qX2 z=%fUUd})IYTvy7t+6y%?E9S%PyzTGZmgM5FZc^T|>f)bMN?%F1z77iEBJ0-_g&S;N zvt!hVyso-2ijrm}e-88?``r3Wm6Rf!mQ4LqS2ALM#N)2wQNh&hHfs+frCzAM5EAuj z)9rm(dsO&TCvpH~jhTN0TbAuD*LVeboJy&rs3=qVydm$Uwu~bJ-Ilx~r|%?`vn!Xhp0s z8$^5k8e2EN%}pb{)I?sr&X}80tNiiQOhFvcqcJ>YhiVz3g_?H~d7nw|U3aRe^ksQh z;k)tO^iXq6>eH&5-^%SPaYbhp>bO`ijd@!`%JPsZ|s;>!&Oa=Hxm_ zsy~sUiL;Zuyzd-aZQX9&Ro!p{e^#uW&*nN^MWT7J@UBOAqBoJsr=%#oPjX58otFMxem^wkO$w_3Zcj3KhNgSr0$PQ# zf~=1AJ0hMl$fZW8IRVCLbk#L7XX*j3ZyrkD71CEoSru(x_Pv?7TV~+sC4N5iKf*-6 z(JY2O$|1ow`Ck^&mH}M|pSOQ_;vjQZXtfGF>5S7&U7gc^jA{M*Ly!@F@9ck#_?MR$ zA8v)La{nV7A3er&vootVX#MH+a}s;ykaW?vM+I$_Ceg|(>RZ)FpZzN z|3Ck+Cfe>1gJ0!XzW0A#0ZxqAXLzNiXVqDA*h?5t+0K(zxiNK~ogDYPFA?Hkh(6zc zVq|()GSMHb)|s}h`#pGY&~kVI)+S^RGQ{?G`nJ1PkZrJ&QB@+q!GHvGA^Tzv%it=U z|D7nyj>i3<&ku6l(2oA+A7S`Aoh^NlBv_f!P^xg-p}t~K;%e%1`TT8HN}!)%<36~o VbJ;o8^#c4+zNN14PR=ag{{qS|ao+#{ literal 0 HcmV?d00001 From 7caec10e7b978853f8f87fe1d0cf77aa85066cdb Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 16 Aug 2025 13:16:34 +0800 Subject: [PATCH 325/932] [XPU]avoid circular import during XPU init (#23017) Signed-off-by: Kunshang Ji --- vllm/platforms/xpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 66ebc8ad9d..af24437f64 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -from vllm.config import CUDAGraphMode from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS @@ -105,6 +104,8 @@ class XPUPlatform(Platform): and not cls.device_support_bf16(): model_config.dtype = torch.float16 + # lazy import to avoid circular import + from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ From 5157827cfc0fd06d361897b2cc912ee1b5bc6277 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 16 Aug 2025 01:36:27 -0400 Subject: [PATCH 326/932] [Build] Env var to disable sccache (#22968) Signed-off-by: Lucas Wilkinson --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 919300e143..cc3037ebb7 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,8 @@ MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: - return which("sccache") is not None + return which("sccache") is not None and \ + not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))) def is_ccache_available() -> bool: From 78863f8c5c67367f32533dd0230faae51ec51145 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Sat, 16 Aug 2025 01:25:10 -0500 Subject: [PATCH 327/932] [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962) Signed-off-by: Andrew Sansom --- .../openai/test_prompt_validation.py | 49 +++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 6 ++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e31a1d0776..4197583074 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io + # imports for guided decoding tests import openai +import 
pybase64 import pytest import regex as re +import torch + +from vllm.entrypoints.openai.serving_engine import OpenAIServing from ...utils import RemoteOpenAIServer @@ -42,3 +48,46 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.parametrize("dtype", + [torch.float32, torch.bfloat16, torch.float16]) +@pytest.mark.parametrize( + "layout", + [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]) +@pytest.mark.parametrize("seq_len", [2, 10]) +@pytest.mark.parametrize("hidden_size", [2, 10]) +def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, + seq_len: int, hidden_size: int): + # construct arbitrary tensors of various dtypes, layouts, and sizes. + # We need to check against different layouts to make sure that if a user + # uses sparse tensors to reduce the transmission size of prompt embeddings, + # we must cast them to dense/strided before passing them into the engine. + # We don't use non-CPU tensors in this test to avoid preemptively + # initializing cuda and break other tests in the suite that fork processes. + # We also need to make sure that we only use devices that are actually + # available in the environment the test is running on. For simplicity, + # we just test against CPU. + tensor = torch.randn((seq_len, hidden_size), dtype=dtype) + if layout == torch.strided: + tensor = tensor.contiguous() + elif layout == torch.sparse_coo: + tensor = tensor.to_sparse_coo() + elif layout == torch.sparse_csc: + tensor = tensor.to_sparse_csc() + elif layout == torch.sparse_csr: + tensor = tensor.to_sparse_csr() + + buffer = io.BytesIO() + torch.save(tensor, buffer) + buffer.seek(0) + encoded_tensor = pybase64.b64encode(buffer.getvalue()) + + loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) + assert len(loaded_prompt_embeds) == 1 + loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] + assert loaded_tensor.device.type == "cpu" + assert loaded_tensor.layout == torch.strided + torch.testing.assert_close(loaded_tensor, + tensor.to("cpu").to_dense(), + equal_nan=True) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d6f92a6330..0f4a7c0186 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1006,8 +1006,8 @@ class OpenAIServing: # OPTIMIZATION priority = orig_priority - 1 + @staticmethod def _load_prompt_embeds( - self, prompt_embeds: Optional[Union[bytes, list[bytes]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None ) -> list[EmbedsPrompt]: @@ -1015,12 +1015,14 @@ class OpenAIServing: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor = torch.load(io.BytesIO( pybase64.b64decode(embed, validate=True)), - weights_only=True) + weights_only=True, + map_location=torch.device("cpu")) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, torch.bfloat16, torch.float16, ) + tensor = tensor.to_dense() if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 6d3da472bc8f202229a8e178671f4fe72037cfb1 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 16 Aug 2025 15:26:10 +0800 Subject: [PATCH 328/932] [Misc] Add --save-dir option to benchmark_moe (#23020) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 13bf1be836..b4a03665ef 100644 
--- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -542,6 +543,7 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -552,7 +554,8 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: json.dump(configs, f, indent=4) @@ -707,6 +710,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -748,6 +752,9 @@ if __name__ == "__main__": "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") From cc826a202b7b66af222374129573763237db3c1c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 15:44:50 +0800 Subject: [PATCH 329/932] [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867) Signed-off-by: Isotr0py --- tests/models/multimodal/test_tensor_schema.py | 143 +++++++++++++++--- vllm/model_executor/models/keye.py | 22 ++- 2 files changed, 138 insertions(+), 27 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 92390d8c2f..036624431c 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -1,17 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from functools import partial +from typing import Any, Union from unittest.mock import patch +import numpy as np import pytest +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from PIL import Image from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs) from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils import GiB_bytes, set_default_torch_num_threads +from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -23,12 +32,64 @@ ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", "MiniMaxVL01ForConditionalGeneration": "broken model", } +ARCH_NEEDS_EXTRAS = [ + 
"InternVLChatModel", + "Idefics3ForConditionalGeneration", + "LlavaForConditionalGeneration", + "MiniCPMV", + "PaliGemmaForConditionalGeneration", +] +REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} + +ImageInput = list[Image.Image] +VideoInput = Union[list[Image.Image], list[np.ndarray], + list[tuple[np.ndarray, dict[str, Any]]]] +AudioInput = list[tuple[np.ndarray, int]] + + +def _resize_data(_data: Union[Image.Image, np.ndarray], + size_factor: float) -> Union[Image.Image, np.ndarray]: + assert size_factor <= 1, "Size factor must be less than 1" + # Image input + if isinstance(_data, Image.Image): + W, H = _data.width, _data.height + W, H = map(lambda x: int(x * size_factor), (W, H)) + return _data.resize((W, H)) + # Video input with PIL Images + elif is_list_of(_data, Image.Image): + W, H = next(iter(_data)).width, next(iter(_data)).height + T = len(_data) + T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H)) + return [d.resize((W, H)) for d in _data[:T]] + # Video input with numpy arrays + elif isinstance(_data, np.ndarray) and _data.ndim >= 4: + T, H, W, C = _data.shape[-4:] + T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W)) + return _data[..., :T, :H, :W, :C] + # Audio input + elif isinstance(_data, np.ndarray) and _data.ndim == 1: + return _data[:int(len(_data) * size_factor)] + raise AssertionError("This line should be unreachable.") + + +def resize_mm_data( + data: Union[ImageInput, VideoInput, AudioInput], + size_factors: tuple[float, + ...]) -> Union[ImageInput, VideoInput, AudioInput]: + size_factors = size_factors[:len(data)] + if is_list_of(data, (Image.Image, np.ndarray, list)): + return [_resize_data(d, s) for d, s in zip(data, size_factors)] + elif is_list_of(data, tuple): + return [(_resize_data(d, s), meta) + for (d, meta), s in zip(data, size_factors)] + raise ValueError("Unsupported multimodal data type.") def create_batched_mm_kwargs( model_config: ModelConfig, processor: BaseMultiModalProcessor, -) -> MultiModalKwargs: + size_factors: tuple[float, ...] 
= (1.0, 0.5, 0.25), +) -> Iterable[tuple[str, int, BatchedTensorInputs]]: processing_info = processor.info dummy_inputs = processor.dummy_inputs supported_mm_limits = processing_info.get_supported_mm_limits() @@ -40,30 +101,69 @@ def create_batched_mm_kwargs( seq_len=model_config.max_model_len, mm_counts=mm_counts, ) + mm_data = processor_inputs.mm_data + resized_mm_data = { + modality: resize_mm_data(data, size_factors) + for modality, data in mm_data.items() + } + # Mistral chat outputs tokens directly, rather than text prompts + if model_config.tokenizer_mode == "mistral": + images = resized_mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + tokenizer = processing_info.get_tokenizer() + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = processor_inputs.prompt mm_kwargs = processor.apply( - prompt=processor_inputs.prompt, - mm_data=processor_inputs.mm_data, + prompt=prompt, + mm_data=resized_mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, )["mm_kwargs"] - mm_kwargs = MultiModalKwargs.batch([mm_kwargs]) - return mm_kwargs + items = [ + item for modality in supported_mm_limits + for item in mm_kwargs.get_items(modality) + ] + return group_mm_kwargs_by_modality(items) + + +def get_model_id_to_test( + model_arch_list: Iterable[str]) -> list[tuple[str, str]]: + filtered_results = [] + for model_arch in model_arch_list: + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS: + available_repos = list( + map(lambda model_id: (model_arch, model_id), + [model_info.default, *model_info.extras.values()])) + filtered_results.extend(available_repos) + else: + filtered_results.append((model_arch, model_info.default)) + return filtered_results @pytest.mark.core_model -@pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) -def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], - monkeypatch): +@pytest.mark.parametrize( + "model_arch, model_id", + get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) +def test_model_tensor_schema(model_arch: str, model_id: str, + vllm_runner: type[VllmRunner], monkeypatch): if model_arch in ARCH_TO_SKIP: pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") + if model_id in REPO_ID_TO_SKIP: + pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}") model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip", check_max_version=False) - model_id = model_info.default - hf_overrides_fn = partial(dummy_hf_overrides, model_arch=model_arch, exist_overrides=model_info.hf_overrides) @@ -119,6 +219,7 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], if model_info.v0_only: m.setenv("VLLM_USE_V1", "0") + # TODO(Isotr0py): Can we avoid initializing engine? 
with ( set_default_torch_num_threads(1), vllm_runner( @@ -145,12 +246,16 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], mm_registry = llm_engine.input_preprocessor.mm_registry processor = mm_registry.create_processor(model_config) - mm_kwargs = create_batched_mm_kwargs(model_config, processor) - def validate_model_input(model): - for modality in ("audio", "image", "video"): - method_name = f"_parse_and_validate_{modality}_input" - if hasattr(model, method_name): - getattr(model, method_name)(**mm_kwargs) + def validate_model_input(model, modality: str, + mm_kwargs: MultiModalKwargs): + method_name = f"_parse_and_validate_{modality}_input" + if hasattr(model, method_name): + getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) \ No newline at end of file + for modality, _, mm_kwargs in create_batched_mm_kwargs( + model_config, processor): + valid_func = partial(validate_model_input, + modality=modality, + mm_kwargs=mm_kwargs) + vllm_model.apply_model(valid_func) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 40c66c2268..db9ed5910d 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, VideoItem) @@ -44,6 +44,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -112,8 +113,9 @@ class KeyeImagePixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values"] - pixel_values: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] @@ -145,8 +147,9 @@ class KeyeVideoPixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values_videos"] - pixel_values_videos: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] @@ -1295,7 +1298,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, return None return quant_config - def _validate_and_reshape_mm_tensor(self, mm_input: object, + def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. 
" @@ -1310,8 +1313,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) + elif is_list_of(mm_input, torch.Tensor): + if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2 + for p in mm_input): + return mm_input + return torch.concat(list(mm_input)) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeImageInputs]: From 933f45334a79dcb69aa93178b3bbf3d9e0d46f09 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 16 Aug 2025 00:46:00 -0700 Subject: [PATCH 330/932] [Core] Make cudagraph check cuda platform only (#23005) Signed-off-by: Chengji Yao Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao Co-authored-by: Li, Jiang --- vllm/config/__init__.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 280ae60c91..72fec5e205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3535,15 +3535,6 @@ class VllmConfig: # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION - # if cudagraph_mode is not explicitly set by users, set default value - if self.compilation_config.cudagraph_mode is None: - if envs.VLLM_USE_V1 and self.compilation_config.level \ - == CompilationLevel.PIECEWISE: - self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: @@ -3552,14 +3543,28 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - # disable cudagraph when enforce eager execution - if self.model_config is not None and self.model_config.enforce_eager: - logger.info("Cudagraph is disabled under eager mode") - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif envs.VLLM_USE_V1: - self.compilation_config.cudagraph_num_of_warmups = 1 + if current_platform.is_cuda_alike(): + # if cudagraph_mode is not explicitly set by users, set default + # value + if self.compilation_config.cudagraph_mode is None: + if envs.VLLM_USE_V1 and self.compilation_config.level \ + == CompilationLevel.PIECEWISE: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - self._set_cudagraph_sizes() + # disable cudagraph when enforce eager execution + if self.model_config is not None and \ + self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + elif envs.VLLM_USE_V1: + self.compilation_config.cudagraph_num_of_warmups = 1 + + self._set_cudagraph_sizes() + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE if self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ @@ -3618,7 +3623,7 @@ class VllmConfig: current_platform.check_and_update_config(self) # final check of cudagraph mode after platform-specific update - if envs.VLLM_USE_V1: + if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ and self.model_config is not None and 
\ not self.model_config.disable_cascade_attn: From 2dbccce8a67e8004b365e7e533107c54c9542ce7 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 17:44:19 +0800 Subject: [PATCH 331/932] [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954) Signed-off-by: Isotr0py --- tests/models/registry.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3efc9a99ea..10e29e01e8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -196,7 +196,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501 - min_transformers_version="4.55.1"), + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", @@ -408,14 +409,16 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 + {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", trust_remote_code=True), - "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 - {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -455,6 +458,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 @@ -482,7 +487,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), - "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 + "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True, is_available_online=False), From de9cb617637deabab4e34db05d26c8d4d6b2ed98 Mon Sep 17 00:00:00 2001 From: Seiji 
Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 16 Aug 2025 03:21:20 -0700 Subject: [PATCH 332/932] Add docs for PrefixRepetitionDataset + enable usage with `vllm bench throughput` (#23012) Signed-off-by: Seiji Eicher Co-authored-by: Roger Wang --- benchmarks/README.md | 22 +++++++++++++- vllm/benchmarks/throughput.py | 57 ++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caff8f0342..1d715a193e 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -40,7 +40,7 @@ become available. wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv - Sonnet + Sonnet (deprecated) ✅ ✅ Local file: benchmarks/sonnet.txt @@ -51,6 +51,12 @@ become available. ✅ synthetic + + Prefix Repetition + ✅ + ✅ + synthetic + HuggingFace-VisionArena ✅ @@ -592,6 +598,20 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` +### Prefix Repetition Dataset + +```bash +vllm bench serve \ + --backend openai \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-name prefix_repetition \ + --num-prompts 100 \ + --prefix-repetition-prefix-len 512 \ + --prefix-repetition-suffix-len 128 \ + --prefix-repetition-num-prefixes 5 \ + --prefix-repetition-output-len 128 +``` + ## ⚡ Example - Request Prioritization Benchmark diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index fdf6548ada..0c19fa6dcf 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -18,9 +18,11 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, ConversationDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, + PrefixRepetitionRandomDataset, + RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs @@ -327,6 +329,12 @@ def get_requests(args, tokenizer): dataset_cls = AIMODataset common_kwargs['dataset_subset'] = None common_kwargs['dataset_split'] = "train" + elif args.dataset_name == "prefix_repetition": + dataset_cls = PrefixRepetitionRandomDataset + sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len + sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len + sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes + sample_kwargs["output_len"] = args.prefix_repetition_output_len else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -356,7 +364,11 @@ def validate_args(args): raise ValueError(f"Unsupported backend: {args.backend}") # === Dataset Configuration === - if not args.dataset and not args.dataset_path: + if ( + not args.dataset + and not args.dataset_path + and args.dataset_name not in {"prefix_repetition"} + ): print( "When dataset path is not set, it will default to random dataset") args.dataset_name = 'random' @@ -432,7 +444,10 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + choices=[ + "sharegpt", "random", "sonnet", "burstgpt", "hf", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", default="sharegpt") parser.add_argument( @@ -521,6 +536,38 @@ def 
add_cli_args(parser: argparse.ArgumentParser): default=None, help="Split of the HF dataset.") + # prefix repetition dataset + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=None, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=None, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=None, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + parser = AsyncEngineArgs.add_cli_args(parser) From 4dff91c93da668f4cca3f80aa3a94622d21c34fc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Aug 2025 19:30:49 +0800 Subject: [PATCH 333/932] [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022) Signed-off-by: DarkLight1337 --- tests/v1/core/test_kv_cache_utils.py | 12 +----- tests/v1/core/test_prefix_caching.py | 12 +----- tests/v1/core/test_scheduler.py | 12 +----- tests/v1/core/utils.py | 12 +----- vllm/multimodal/inputs.py | 62 ++++++++-------------------- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/mm_input_cache.py | 33 ++++++++------- vllm/v1/engine/processor.py | 10 +++-- vllm/v1/request.py | 7 +++- vllm/v1/worker/gpu_model_runner.py | 4 +- 10 files changed, 59 insertions(+), 108 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e0b91e6dd7..47c74aff1e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,9 +7,7 @@ import pytest import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -42,13 +40,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 28cfca6767..89824768ed 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,9 +9,7 @@ import pytest import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams 
from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -37,13 +35,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ac70c90d92..23762a0fb6 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,9 +8,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1328,13 +1326,7 @@ def create_requests_with_priority( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) else: mm_position = None diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 52093d3d38..849c3f59ae 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,9 +6,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -143,13 +141,7 @@ def create_requests( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) mm_hashes = ["hash"] * len(mm_position) else: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 0bbac45c12..a33ce14699 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence -from dataclasses import dataclass, replace +from dataclasses import dataclass from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, @@ -218,7 +218,7 @@ class MultiModalFieldElem: i.e. the name of the keyword argument to be passed to the model. 
""" - data: Optional[NestedTensors] + data: NestedTensors """ The tensor data of this field in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], @@ -315,13 +315,8 @@ class BaseMultiModalField(ABC): if len(set(field_types)) > 1: raise ValueError(f"Cannot merge different {field_types=}") - validated_data = list[NestedTensors]() - for i, elem in enumerate(elems): - assert elem.data is not None, ( - f"Cannot merge with empty `elems[{i}]`") - validated_data.append(elem.data) - - return self._reduce_data(validated_data, pin_memory=pin_memory) + batch = [elem.data for elem in elems] + return self._reduce_data(batch, pin_memory=pin_memory) @dataclass(frozen=True) @@ -643,6 +638,17 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ + @staticmethod + def dummy(modality: str): + """Convenience class for testing.""" + mm_elem = MultiModalFieldElem( + modality=modality, + key="dummy", + data=torch.empty(1), + field=MultiModalSharedField(1), + ) + return MultiModalKwargsItem.from_elems([mm_elem]) + @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) @@ -654,46 +660,12 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): assert len(modalities) == 1, f"Found different modalities={modalities}" self._modality = next(iter(modalities)) - self._is_empty = any(elem.data is None for elem in self.values()) - @property def modality(self) -> str: return self._modality - @property - def is_empty(self) -> bool: - return self._is_empty - - def get_data(self) -> Optional[Mapping[str, NestedTensors]]: - if self._is_empty: - return None - - out_data = dict[str, NestedTensors]() - for key, elem in self.items(): - assert elem.data is not None, ( - f"Cannot get data of empty `elem[{key!r}]`") - out_data[key] = elem.data - - return out_data - - def require_data(self) -> Mapping[str, NestedTensors]: - if (data := self.get_data()) is None: - raise RuntimeError("Cannot get data of empty item") - - return data - - # These methods create a new item to avoid mutating cached items in place - def with_data(self, data: Mapping[str, NestedTensors]): - return MultiModalKwargsItem({ - key: replace(elem, data=data[key]) - for key, elem in self.items() - }) - - def without_data(self): - return MultiModalKwargsItem({ - key: replace(elem, data=None) - for key, elem in self.items() - }) + def get_data(self) -> Mapping[str, NestedTensors]: + return {key: elem.data for key, elem in self.items()} # NOTE: UserDict is for V0 compatibility. 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b29394f3e6..f7ec982db4 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Sequence from typing import Any, Optional, Union import msgspec @@ -47,7 +48,7 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - mm_kwargs: Optional[list[MultiModalKwargsItem]] + mm_kwargs: Optional[Sequence[Optional[MultiModalKwargsItem]]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: Optional[SamplingParams] diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 1fed74330f..aa7dc62fd4 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Mapping -from typing import TYPE_CHECKING +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional from vllm.multimodal import MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargsItem +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.config import ModelConfig @@ -58,21 +59,21 @@ class MultiModalInputCacheClient: def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[MultiModalKwargsItem], mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: + ) -> list[Optional[MultiModalKwargsItem]]: if not self.enabled: - return mm_kwargs + return list(mm_kwargs) assert len(mm_kwargs) == len(mm_hashes) - out_mm_items = list[MultiModalKwargsItem]() + out_mm_items = list[Optional[MultiModalKwargsItem]]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(mm_item.without_data()) + out_mm_items.append(None) else: self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item.require_data()) + MultiModalCacheItemMetadata.wraps(mm_item) out_mm_items.append(mm_item) return out_mm_items @@ -91,25 +92,27 @@ class MultiModalInputCacheServer: self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), - Mapping[str, NestedTensors], + MultiModalKwargsItem, ) def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], mm_hashes: list[str], ) -> list[MultiModalKwargsItem]: if not self.enabled: - return mm_kwargs + mm_kwargs_lst = list(mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) + return mm_kwargs_lst assert len(mm_kwargs) == len(mm_hashes) out_mm_items = list[MultiModalKwargsItem]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if (mm_data := mm_item.get_data()) is None: - out_mm_items.append(mm_item.with_data(self.mm_cache[mm_hash])) + if mm_item is None: + out_mm_items.append(self.mm_cache[mm_hash]) else: - self.mm_cache[mm_hash] = mm_data + self.mm_cache[mm_hash] = mm_item out_mm_items.append(mm_item) return out_mm_items diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 376c76a7e7..c6a23cdbf6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,6 +17,7 @@ from vllm.multimodal.utils import argsort_mm_positions 
from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.utils import is_list_of from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( @@ -295,7 +296,7 @@ class Processor: pooling_params = params.clone() # Multimodal related. - sorted_mm_inputs: Optional[list[MultiModalKwargsItem]] = None + sorted_mm_inputs: Optional[list[Optional[MultiModalKwargsItem]]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None sorted_mm_hashes: Optional[list[str]] = None if decoder_inputs["type"] == "multimodal": @@ -308,7 +309,7 @@ class Processor: # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - sorted_mm_inputs = [ + orig_sorted_mm_inputs = [ decoder_mm_inputs.get_item(modality, idx) for modality, idx in sorted_mm_idxs ] @@ -323,9 +324,12 @@ class Processor: if sorted_mm_hashes is not None: sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - sorted_mm_inputs, + orig_sorted_mm_inputs, sorted_mm_hashes, ) + else: + assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem) + sorted_mm_inputs = orig_sorted_mm_inputs return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 562925bde6..8b703b6191 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -125,14 +125,17 @@ class Request: block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] ) -> "Request": if request.mm_kwargs is not None: - assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( + mm_kwargs_lst = list(request.mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") + else: + mm_kwargs_lst = None return cls( request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - multi_modal_kwargs=request.mm_kwargs, + multi_modal_kwargs=mm_kwargs_lst, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c919b392f..5ee44a8257 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -500,8 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): second_per_grid_ts = [] audio_feature_lengths = [] use_audio_in_video = False - for item in self.requests[req_id].mm_kwargs: - mm_input = item.require_data() + for mm_item in self.requests[req_id].mm_kwargs: + mm_input = mm_item.get_data() if mm_input.get("image_grid_thw") is not None: image_grid_thw.append( mm_input["image_grid_thw"].tolist()) From 829bbd7882222c85c0ca5a17fbb2f70e543f50ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 16 Aug 2025 20:16:58 +0800 Subject: [PATCH 334/932] [New Model]mBART model (#22883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- docs/models/supported_models.md | 4 + examples/offline_inference/encoder_decoder.py | 235 +++++---- .../models/language/generation/test_mbart.py | 123 +++++ tests/models/registry.py | 2 + vllm/model_executor/models/bart.py | 444 +++++++++++++++++- vllm/model_executor/models/registry.py | 1 + 6 files changed, 717 
insertions(+), 92 deletions(-) create mode 100644 tests/models/language/generation/test_mbart.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce..a514572945 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -330,6 +330,7 @@ th { | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | +| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | @@ -418,6 +419,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +!!! note + Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0da6fa5c4a..df6c1eaf4a 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -2,9 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART +encoder/decoder models, specifically BART and mBART. + +This script is refactored to allow model selection via command-line arguments. """ +import argparse +from typing import NamedTuple, Optional + from vllm import LLM, SamplingParams from vllm.inputs import ( ExplicitEncoderDecoderPrompt, @@ -14,119 +19,175 @@ from vllm.inputs import ( ) -def create_prompts(tokenizer): - # Test prompts - # - # This section shows all of the valid ways to prompt an - # encoder/decoder model. - # - # - Helpers for building prompts - text_prompt_raw = "Hello, my name is" - text_prompt = TextPrompt(prompt="The president of the United States is") +class ModelRequestData(NamedTuple): + """ + Holds the configuration for a specific model, including its + HuggingFace ID and the prompts to use for the demo. + """ + + model_id: str + encoder_prompts: list + decoder_prompts: list + hf_overrides: Optional[dict] = None + + +def get_bart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/bart-large-cnn. + This uses the exact test cases from the original script. 
+ """ + encoder_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "An encoder prompt", + ] + decoder_prompts = [ + "A decoder prompt", + "Another decoder prompt", + ] + return ModelRequestData( + model_id="facebook/bart-large-cnn", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + ) + + +def get_mbart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/mbart-large-en-ro. + This uses prompts suitable for an English-to-Romanian translation task. + """ + encoder_prompts = [ + "The quick brown fox jumps over the lazy dog.", + "How are you today?", + ] + decoder_prompts = ["", ""] + hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} + return ModelRequestData( + model_id="facebook/mbart-large-en-ro", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + hf_overrides=hf_overrides, + ) + + +MODEL_GETTERS = { + "bart": get_bart_config, + "mbart": get_mbart_config, +} + + +def create_all_prompt_types( + encoder_prompts_raw: list, + decoder_prompts_raw: list, + tokenizer, +) -> list: + """ + Generates a list of diverse prompt types for demonstration. + This function is generic and uses the provided raw prompts + to create various vLLM input objects. + """ + text_prompt_raw = encoder_prompts_raw[0] + text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(prompt="The capital of France is") - ) - # - Pass a single prompt to encoder/decoder model - # (implicitly encoder input prompt); - # decoder input prompt is assumed to be None - - single_text_prompt_raw = text_prompt_raw # Pass a string directly - single_text_prompt = text_prompt # Pass a TextPrompt - single_tokens_prompt = tokens_prompt # Pass a TokensPrompt - - # ruff: noqa: E501 - # - Pass explicit encoder and decoder input prompts within one data structure. - # Encoder and decoder prompts can both independently be text or tokens, with - # no requirement that they be the same prompt type. Some example prompt-type - # combinations are shown below, note that these are not exhaustive. 
- - enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, - ) - enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, - ) - enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, + prompt_token_ids=tokenizer.encode( + encoder_prompts_raw[2 % len(encoder_prompts_raw)] + ) ) - # - Finally, here's a useful helper function for zipping encoder and - # decoder prompts together into a list of ExplicitEncoderDecoderPrompt - # instances + decoder_tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) + ) + single_prompt_examples = [ + text_prompt_raw, + text_prompt, + tokens_prompt, + ] + explicit_pair_examples = [ + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt_raw, + decoder_prompt=decoder_tokens_prompt, + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt, + decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=tokens_prompt, + decoder_prompt=text_prompt, + ), + ] zipped_prompt_list = zip_enc_dec_prompts( - ["An encoder prompt", "Another encoder prompt"], - ["A decoder prompt", "Another decoder prompt"], + encoder_prompts_raw, + decoder_prompts_raw, ) - - # - Let's put all of the above example prompts together into one list - # which we will pass to the encoder/decoder LLM. - return [ - single_text_prompt_raw, - single_text_prompt, - single_tokens_prompt, - enc_dec_prompt1, - enc_dec_prompt2, - enc_dec_prompt3, - ] + zipped_prompt_list + return single_prompt_examples + explicit_pair_examples + zipped_prompt_list -# Create a sampling params object. -def create_sampling_params(): +def create_sampling_params() -> SamplingParams: + """Create a sampling params object.""" return SamplingParams( temperature=0, top_p=1.0, min_tokens=0, - max_tokens=20, + max_tokens=30, ) -# Print the outputs. -def print_outputs(outputs): - print("-" * 50) +def print_outputs(outputs: list): + """Formats and prints the generation outputs.""" + print("-" * 80) for i, output in enumerate(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text print(f"Output {i + 1}:") - print( - f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}" + print(f"Encoder Prompt: {encoder_prompt!r}") + print(f"Decoder Prompt: {prompt!r}") + print(f"Generated Text: {generated_text!r}") + print("-" * 80) + + +def main(args): + """Main execution function.""" + model_key = args.model + if model_key not in MODEL_GETTERS: + raise ValueError( + f"Unknown model: {model_key}. 
" + f"Available models: {list(MODEL_GETTERS.keys())}" ) - print("-" * 50) + config_getter = MODEL_GETTERS[model_key] + model_config = config_getter() - -def main(): - dtype = "float" - - # Create a BART encoder/decoder model instance + print(f"🚀 Running demo for model: {model_config.model_id}") llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, + model=model_config.model_id, + dtype="float", + hf_overrides=model_config.hf_overrides, ) - - # Get BART tokenizer tokenizer = llm.llm_engine.get_tokenizer_group() - - prompts = create_prompts(tokenizer) + prompts = create_all_prompt_types( + encoder_prompts_raw=model_config.encoder_prompts, + decoder_prompts_raw=model_config.decoder_prompts, + tokenizer=tokenizer, + ) sampling_params = create_sampling_params() - - # Generate output tokens from the prompts. The output is a list of - # RequestOutput objects that contain the prompt, generated - # text, and other information. outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="A flexible demo for vLLM encoder-decoder models." + ) + parser.add_argument( + "--model", + "-m", + type=str, + default="bart", + choices=MODEL_GETTERS.keys(), + help="The short name of the model to run.", + ) + args = parser.parse_args() + main(args) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py new file mode 100644 index 0000000000..854a727139 --- /dev/null +++ b/tests/models/language/generation/test_mbart.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import DecoderPromptType, HfRunner, VllmRunner +from ...utils import check_logprobs_close + + +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + hf_output_str = output_str + "" + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[dict[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM mBART model by validating it against HuggingFace (HF). 
+ (Docstring content is omitted for brevity) + ''' + + vllm_prompts = prompts + if decoder_prompt_type == DecoderPromptType.NONE: + vllm_prompts = [{ + "encoder_prompt": p['encoder_prompt'], + "decoder_prompt": "" + } for p in prompts] + + vllm_kwargs = { + "hf_overrides": { + "architectures": ["MBartForConditionalGeneration"] + } + } + + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vllm_kwargs) as vllm_model: # type: ignore + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + vllm_prompts, max_tokens, num_logprobs) + + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_kwargs["decoder_start_token_id"] = ( + hf_model.tokenizer.lang_code_to_id["ro_RO"]) + + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, # HF runner still uses the original prompts + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = 0 + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize( + "model", + [pytest.param("facebook/mbart-large-en-ro")], +) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 10e29e01e8..99cf997790 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -316,6 +316,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 + hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3d328c88ff..32551d8102 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -46,7 +46,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant, SupportsV0Only -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, + maybe_prefix) logger = logging.get_logger(__name__) @@ -422,10 +423,7 @@ class BartEncoderLayer(nn.Module): if hidden_states.dtype == torch.float16 and ( torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): - clamp_value = 
torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, - min=-clamp_value, - max=clamp_value) + hidden_states = cast_overflow_tensors(hidden_states) return hidden_states @@ -906,3 +904,439 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): }) return loaded_params + + +class MBartEncoderLayer(BartEncoderLayer): + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + r""" + Args: + hidden_states + torch.Tensor of *encoder* input embeddings. + Returns: + Encoder layer output torch.Tensor + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + hidden_states = cast_overflow_tensors(hidden_states) + + return hidden_states + + +class MBartDecoderLayer(BartDecoderLayer): + + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + residual = decoder_hidden_states + hidden_states = self.self_attn_layer_norm(decoder_hidden_states) + + # Self Attention + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + # Cross-Attention Block + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states = self.encoder_attn( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class MBartEncoder(nn.Module): + """ + Transformer encoder consisting of *config.encoder_layers* + self attention layers. Each layer is a [`BartEncoderLayer`]. 
+ Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = ""): + super().__init__() + + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + embed_dim = config.d_model + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + embed_dim, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([ + MBartEncoderLayer(config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.encoder_layers) + ]) + + self.layernorm_embedding = nn.LayerNorm(embed_dim) + self.layer_norm = nn.LayerNorm(config.d_model) # 改动 + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *encoder* input sequence tokens. + Returns: + Decoder output torch.Tensor + """ + # retrieve input_ids and inputs_embeds + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states=hidden_states) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartDecoder(nn.Module): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
+ Each layer is a [`BartDecoderLayer`] + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__( + self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = "", + ): + super().__init__() + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + + self.layers = nn.ModuleList( + [MBartDecoderLayer(config, cache_config, quant_config, + prefix=f"{prefix}.layers.{layer_idx}") \ + for layer_idx in range(config.decoder_layers)]) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids: torch.Tensor, + decoder_positions: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + decoder_input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + decoder_positions + Positions of *decoder* input sequence tokens. + encoder_hidden_states: + Tensor of encoder output embeddings + Returns: + Decoder output torch.Tensor + """ + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(decoder_input_ids) + else: + decoder_positions = inputs_embeds[:, -1] + + # embed positions + embed_pos = self.embed_positions(decoder_positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + # decoder layers + + for decoder_layer in self.layers: + hidden_states = decoder_layer( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartModel(nn.Module, SupportsQuant): + _tied_weights_keys = [ + "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" + ] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.encoder = MBartEncoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") + self.decoder = MBartDecoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.decoder") + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. 
+ Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states) + + return decoder_outputs + + +class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): + base_model_prefix = "model" + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder.": "model.decoder.", + "encoder.": "model.encoder.", + "shared.": "model.shared." + }, + orig_to_new_substr={ + "beta": "bias", + "gamma": "weight", + "LayerNorm": "layernorm", + }, + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config + assert config.tie_word_embeddings + self.config = config + self.model = MBartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.lm_head = BartParallelLMHead(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + model_params_dict = dict(self.named_parameters()) + loaded_params = set() + remaining_weights = [] + shared_embedding_weight = None + + for name, loaded_weight in weights: + if any(skip in name + for skip in ["cls.", "pooler.", "final_logits_bias"]): + continue + if any(embed_name in name for embed_name in [ + 'shared.weight', 'encoder.embed_tokens.weight', + 'decoder.embed_tokens.weight' + ]): + if shared_embedding_weight is None: + shared_embedding_weight = loaded_weight + continue + is_stacked = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + vllm_name = name + for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( + ): + vllm_name = vllm_name.replace(src, dst) + for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( + ): + if 
vllm_name.startswith(src): + vllm_name = dst + vllm_name[len(src):] + break + vllm_name = vllm_name.replace(weight_name, param_name) + if vllm_name in model_params_dict: + param = model_params_dict[vllm_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(vllm_name) + is_stacked = True + break + if not is_stacked: + remaining_weights.append((name, loaded_weight)) + loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) + auto_loaded_params = loader.load_weights(remaining_weights, + mapper=self.hf_to_vllm_mapper) + loaded_params.update(auto_loaded_params) + if shared_embedding_weight is not None: + lm_head_param = self.lm_head.weight + weight_loader = getattr(lm_head_param, "weight_loader", + default_weight_loader) + weight_loader(lm_head_param, shared_embedding_weight) + self.model.encoder.embed_tokens.weight = self.lm_head.weight + self.model.decoder.embed_tokens.weight = self.lm_head.weight + loaded_params.update({ + 'model.encoder.embed_tokens.weight', 'lm_head.weight', + 'model.decoder.embed_tokens.weight' + }) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b817615b43..109bc1fe5c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -141,6 +141,7 @@ _TEXT_GENERATION_MODELS = { # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"), } _EMBEDDING_MODELS = { From 52ce1420e9f6f52308f49a2898433a52674a4a8b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sat, 16 Aug 2025 14:36:30 -0300 Subject: [PATCH 335/932] Fix handling of `max_num_batched_tokens` for pooling tasks (#23004) Signed-off-by: Max de Bayser --- vllm/config/__init__.py | 3 --- vllm/engine/arg_utils.py | 10 +++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 72fec5e205..14fc5589a8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3600,9 +3600,6 @@ class VllmConfig: logger.info(reason) self.scheduler_config.chunked_prefill_enabled = False self.scheduler_config.long_prefill_token_threshold = 0 - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.cache_config is not None: self.cache_config.enable_prefix_caching = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8af6d36e0..630fbec453 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1602,9 +1602,6 @@ class EngineArgs: self.enable_prefix_caching = incremental_prefill_supported logger.info("(%s) prefix caching by default", action) - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - # V1 should use the new scheduler by default. 
# Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: @@ -1692,8 +1689,11 @@ class EngineArgs: self.max_num_batched_tokens = \ default_max_num_batched_tokens[usage_context] else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context] + if not self.enable_chunked_prefill: + self.max_num_batched_tokens = model_config.max_model_len + else: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens[usage_context] logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) From 68373d3126b4d2c49a9983fe0696bbd48fc8aad7 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Sun, 17 Aug 2025 02:38:42 +0900 Subject: [PATCH 336/932] [Frontend] Added support for HermesToolParser for models without special tokens (#16890) Signed-off-by: minpeter --- .../tool_parsers/test_hermes_tool_parser.py | 127 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 81 ++++++++--- 2 files changed, 191 insertions(+), 17 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py new file mode 100644 index 0000000000..28b1f8358d --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from ....utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci" + +SERVER_ARGS = [ + "--enforce-eager", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + "--enable-lora", + "--lora-modules", + f"{LORA_MODEL}={LORA_MODEL}", +] + +TOOLS = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": + "The city and state, e.g. 
San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + }, + }, + "required": ["location"], + }, + }, +}] + +MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] + + +@pytest.mark.asyncio +async def test_non_streaming_tool_call(): + """Test tool call in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_current_weather" + + arguments = json.loads(tool_call.function.arguments) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Non-Streaming Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_tool_call(): + """Test tool call in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_current_weather" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Streaming Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index c7030d34d4..d126130ab9 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -52,14 +52,51 @@ class Hermes2ProToolParser(ToolParser): raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") - self.tool_call_start_token_id = self.vocab.get( - self.tool_call_start_token) - self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if (self.tool_call_start_token_id is None - or self.tool_call_end_token_id is None): - raise RuntimeError( - "Hermes 2 Pro Tool parser could not locate tool call start/end " - "tokens in the tokenizer!") + self.tool_call_start_token_ids = self.model_tokenizer.encode( + self.tool_call_start_token, add_special_tokens=False) + self.tool_call_end_token_ids = self.model_tokenizer.encode( + 
self.tool_call_end_token, add_special_tokens=False) + + self.tool_call_start_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_start_token_ids + ] + + self.tool_call_end_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_end_token_ids + ] + + self.buffered_delta_text = "" + + # Very simple idea: when encountering tokens like <, tool, _call, >, + # <, /, tool, _call, >, store them in a buffer. + # When the last token is encountered, empty the buffer and return it. + # If a token appears in an incorrect sequence while storing in the buffer, + # return the preceding buffer along with the token. + def tool_call_delta_buffer(self, delta_text: str): + # If the sequence of tool_call_start or tool_call_end tokens is not yet + # complete, fill the buffer with the token and return "". + if (delta_text in self.tool_call_start_token_array + or delta_text in self.tool_call_end_token_array): + # If delta_text is the last token of tool_call_start_token or + # tool_call_end_token, empty the buffer and return + # the buffered text + delta_text. + if (delta_text == self.tool_call_start_token_array[-1] + or delta_text == self.tool_call_end_token_array[-1]): + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + self.buffered_delta_text = self.buffered_delta_text + delta_text + return "" + else: + if self.buffered_delta_text: + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + return delta_text def extract_tool_calls( self, @@ -124,11 +161,23 @@ class Hermes2ProToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: + # 1. All tokens are parsed based on _text, not token_ids. + # 2. All incoming text data is processed by the tool_call_delta_buffer + # function for buffering before being used for parsing. + + delta_text = self.tool_call_delta_buffer(delta_text) + # If the last characters of previous_text + # match self.buffered_delta_text, remove only the matching part. 
+ if (len(previous_text) >= len(self.buffered_delta_text) + and previous_text[-len(self.buffered_delta_text):] + == self.buffered_delta_text): + previous_text = previous_text[:-len(self.buffered_delta_text)] + current_text = previous_text + delta_text logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) # check to see if we should be streaming a tool call - is there a - if self.tool_call_start_token_id not in current_token_ids: + if self.tool_call_start_token not in current_text: logger.debug("No tool call tokens found!") return DeltaMessage(content=delta_text) @@ -136,14 +185,12 @@ class Hermes2ProToolParser(ToolParser): # figure out where we are in the parsing by counting tool call # start & end tags - prev_tool_start_count = previous_token_ids.count( - self.tool_call_start_token_id) - prev_tool_end_count = previous_token_ids.count( - self.tool_call_end_token_id) - cur_tool_start_count = current_token_ids.count( - self.tool_call_start_token_id) - cur_tool_end_count = current_token_ids.count( - self.tool_call_end_token_id) + prev_tool_start_count = previous_text.count( + self.tool_call_start_token) + prev_tool_end_count = previous_text.count(self.tool_call_end_token) + cur_tool_start_count = current_text.count( + self.tool_call_start_token) + cur_tool_end_count = current_text.count(self.tool_call_end_token) tool_call_portion = None text_portion = None From 000cceca8c329d5b5d99e0186fbd444a390384cd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:16:00 -0400 Subject: [PATCH 337/932] [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016) Signed-off-by: mgoin --- vllm/attention/layer.py | 9 +++++++++ vllm/v1/attention/backends/flashinfer.py | 3 --- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1a9c0e26b5..0e87fa3f23 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -308,6 +308,15 @@ class Attention(nn.Module): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + # FlashInfer requires attention sinks to be float32 + if (self.backend == _Backend.FLASHINFER_VLLM_V1 + and hasattr(self.impl, 'sinks')): + from vllm.v1.attention.backends.flashinfer import FlashInferImpl + assert isinstance(self.impl, FlashInferImpl) + if (self.impl.sinks is not None + and self.impl.sinks.dtype != torch.float32): + self.impl.sinks = self.impl.sinks.to(torch.float32) + def get_attn_backend(self) -> type[AttentionBackend]: return self.attn_backend diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index eac3f33e15..991904229f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -642,9 +642,6 @@ class FlashInferImpl(AttentionImpl): f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." 
) - # Cast sinks to float32 if needed (FlashInfer requirement) - if sinks.dtype != torch.float32: - sinks = sinks.to(torch.float32) self.sinks = sinks def forward( From 3253ae765ef4dc0604a6f3ed3a1dcd61fdda6bda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:33:08 -0400 Subject: [PATCH 338/932] [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028) Signed-off-by: mgoin --- tests/entrypoints/openai/test_default_mm_loras.py | 3 ++- tests/mq_llm_engine/test_error_handling.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 372e9b1fec..b9c466a6fb 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -48,7 +48,8 @@ def multimodal_server(): # noqa: F811 f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}", ] - with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args, + max_wait_seconds=480) as remote_server: yield remote_server diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 3feee01dad..77e3732cd0 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -255,8 +255,8 @@ async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): pass end = time.perf_counter() - assert end - start < 60, ( - "Expected vLLM to gracefully shutdown in <60s " + assert end - start < 100, ( + "Expected vLLM to gracefully shutdown in <100s " "if there is an error in the startup.") From 4fc722eca4f6ad63edf1936989f4d2171aab3ca2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 15:38:21 -0400 Subject: [PATCH 339/932] [Kernel/Quant] Remove AQLM (#22943) Signed-off-by: mgoin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../scripts/hardware_ci/run-amd-test.sh | 1 - CMakeLists.txt | 1 - benchmarks/kernels/benchmark_aqlm.py | 345 ---------- csrc/ops.h | 9 - csrc/quantization/aqlm/gemm_kernels.cu | 597 ------------------ csrc/torch_bindings.cpp | 15 - .../quantization/supported_hardware.md | 1 - docs/mkdocs/hooks/generate_examples.py | 1 - examples/offline_inference/basic/README.md | 14 - tests/compile/test_full_graph.py | 4 - tests/kernels/quantization/test_aqlm.py | 40 -- tests/models/quantization/test_aqlm.py | 68 -- vllm/_custom_ops.py | 41 -- vllm/model_executor/layers/linear.py | 18 - .../layers/quantization/__init__.py | 3 - .../layers/quantization/aqlm.py | 376 ----------- 16 files changed, 1534 deletions(-) delete mode 100644 benchmarks/kernels/benchmark_aqlm.py delete mode 100644 csrc/quantization/aqlm/gemm_kernels.cu delete mode 100644 tests/kernels/quantization/test_aqlm.py delete mode 100644 tests/models/quantization/test_aqlm.py delete mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 5e5a532cb5..df0bae0c9c 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -121,7 +121,6 @@ fi if [[ $commands == *" kernels/quantization"* ]]; then commands="${commands} \ --ignore=kernels/quantization/test_int8_quant.py \ - --ignore=kernels/quantization/test_aqlm.py \ --ignore=kernels/quantization/test_machete_mm.py \ 
--ignore=kernels/quantization/test_block_fp8.py \ --ignore=kernels/quantization/test_block_int8.py \ diff --git a/CMakeLists.txt b/CMakeLists.txt index cda1ffc795..34386d670a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b08..0000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, bias) - - -# Compare the optimized 1x16 and 2x8 cuda 
decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. -def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. - weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/csrc/ops.h b/csrc/ops.h index 3e29f0a973..6e39758f16 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -154,15 +154,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias); - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu deleted file mode 100644 index 79cd2c610b..0000000000 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace vllm { -namespace aqlm { - -__global__ void Code1x16MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - __shared__ int4 sh_b[32 * 9]; - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - half2* a = reinterpret_cast(&dec); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code2x8MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. - -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - extern __shared__ int4 sh[]; - int4* sh_b = sh; - int4* sh_code = sh_b + 32 * 9; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code1x16Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. 
- asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -__global__ void Code2x8Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - extern __shared__ int4 sh[]; - int4* sh_code = sh; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); -#pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -inline int ceildiv(int a, int b) { return (a + b - 1) / b; } - -const int THREAD_M = 16; - -void code1x16_matvec_cuda(const void* __restrict__ A, - const void* __restrict__ B, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute(Code2x8MatVec, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); 
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code1x16_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long. - codebook_stride // as int4. - ); -} - -// Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - cudaFuncSetAttribute(Code2x8Dequant, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - Code2x8Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, codebook_stride); -} - -int codebook_stride(const torch::Tensor& codebooks) { - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each - // codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - codebook_stride(codebook)); -} - -torch::Tensor code1x16_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, - torch::Tensor& C, const torch::Tensor& codebook, - const int4 codebook_a_sizes) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - 2 * codebook_stride(codebook)); -} - -torch::Tensor code2x8_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - size_t i = 0; - int last = 0; - assert(codebook_partition_sizes.size() <= 4); - for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i] + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) { - *cumulative_size = last * 10; - } - return cumulative_sizes; -} - -} // namespace aqlm -} // namespace vllm - -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features == std::accumulate(codebook_partition_sizes.begin(), - codebook_partition_sizes.end(), 0)); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device())); - - if (nbooks == 1 && entries == (1 << 16)) { - vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation.) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) { - vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index a547baec50..5fee106335 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -207,21 +207,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization ops #ifndef USE_ROCM - // Quantized GEMM for AQLM. - ops.def( - "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, " - "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) " - "-> Tensor", - {stride_tag}); - ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); - - // Decompression method for AQLM. 
- ops.def( - "aqlm_dequant(Tensor codes, Tensor codebooks, " - "int[] codebook_partition_sizes) -> Tensor", - {stride_tag}); - ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); - // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index f53e69ecc6..06264d08b5 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -17,7 +17,6 @@ th { | INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6b4c5b3107..1e8b848db4 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -24,7 +24,6 @@ def fix_case(text: str) -> str: "llm": "LLM", "mae": "MAE", "tpu": "TPU", - "aqlm": "AQLM", "gguf": "GGUF", "lora": "LoRA", "rlhf": "RLHF", diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md index 0a2bd6e2b7..cbb3116e97 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/offline_inference/basic/README.md @@ -52,20 +52,6 @@ Try it yourself with the following argument: ### Quantization -#### AQLM - -vLLM supports models that are quantized using AQLM. - -Try one yourself by passing one of the following models to the `--model` argument: - -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf` -- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf` -- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf` - -> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs. - #### GGUF vLLM supports models that are quantized using GGUF. diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 72f962ed74..a2fc6ffeb8 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -31,10 +31,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ] if all: - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) # TODO: figure out why this fails. 
if False and is_quant_method_supported("gguf"): # noqa: SIM223 diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py deleted file mode 100644 index 427db3e602..0000000000 --- a/tests/kernels/quantization/test_aqlm.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops # noqa: F401 - - -def test_aqlm_dequant_opcheck(): - codes = torch.randint(-32768, - 32767, (22016, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((2, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - codebook_partition_sizes = [11008, 11008] - - opcheck(torch.ops._C.aqlm_dequant, - (codes, codebooks, codebook_partition_sizes)) - - -def test_aqlm_gemm_opcheck(): - input = torch.rand((4, 4096), device='cuda', dtype=torch.float16) - codes = torch.randint(-32768, - 32767, (12288, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((3, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16) - codebook_partition_sizes = [4096, 4096, 4096] - bias = None - - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, None)) - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, bias)) diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py deleted file mode 100644 index de6851e2fc..0000000000 --- a/tests/models/quantization/test_aqlm.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - -from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform - -# These ground truth generations were generated using `transformers==4.38.1 -# aqlm==1.1.0 torch==2.2.0` -# and the below code: -# ```python -# from transformers import AutoTokenizer, AutoModelForCausalLM -# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, -# torch_dtype="auto", device_map="cuda").cuda() -# tokenizer = AutoTokenizer.from_pretrained(model_id) -# outputs = [] -# for prompt in example_prompts: -# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") -# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) -# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) -# print(outputs) -# ``` -ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: v', - 'The major milestones in the development of artificial intelligence from ' - '195', - 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The', - 'Explain the difference between supervised and unsupervised learning.' - '\nExplain', - 'Write a short story about a robot that dreams for the first time. 
The', - 'Analyze the impact of the COVID-19 pandemic on global economic', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it', - 'The early bird catches the worm.\nThe early bird catches the' -] - - -@pytest.mark.skipif(not is_quant_method_supported("aqlm") - or current_platform.is_rocm() - or not current_platform.is_cuda(), - reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("num_logprobs", [1]) -def test_models( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # loop through the prompts to compare against the ground truth generations - for prompt_idx in range(len(example_prompts)): - vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ - prompt_idx] - - print("Prompt: ", repr(example_prompts[prompt_idx])) - print("Reference output:", repr(ground_truth_generations[prompt_idx])) - print("Output output: ", repr(vllm_output_str)) - assert vllm_output_str == ground_truth_generations[prompt_idx] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a318637c5a..0d556053f8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -476,32 +476,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): dtype=input.dtype, device=input.device).sum(0) - @register_fake("_C::aqlm_gemm") - def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - out_features = codes.size(0) * codebooks.size(2) - flat_input = input.reshape((-1, input.size(-1))) - flat_output = torch.empty((flat_input.size(0), out_features), - dtype=input.dtype, - device=input.device) - - output_sizes = list(input.shape) - output_sizes.pop() - output_sizes.append(-1) - return flat_output.reshape(tuple(output_sizes)) - - @register_fake("_C::aqlm_dequant") - def _aqlm_dequant_fake( - codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - in_features = codes.size(1) * 8 - out_features = codes.size(0) - return torch.empty((out_features, in_features), - dtype=codebooks.dtype, - device=codebooks.device) - @register_fake("_C::machete_mm") def machete_mm_fake( a: torch.Tensor, @@ -957,21 +931,6 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, sf_offsets) -# aqlm -def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, - codebook_partition_sizes, bias) - - -def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - return torch.ops._C.aqlm_dequant(codes, codebooks, - codebook_partition_sizes) - - # gptq_marlin def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 75391c51f7..671ad9eed2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -692,8 +692,6 @@ 
class MergedColumnParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scale to load scalar into fused array. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -781,13 +779,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(0, shard_offset, shard_size) - # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( @@ -1081,8 +1072,6 @@ class QKVParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scales in fused case. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -1204,13 +1193,6 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(0, shard_index * shard_size, - shard_size) # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 8d63027e18..a4c2671225 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -7,7 +7,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) QuantizationMethods = Literal[ - "aqlm", "awq", "deepspeedfp", "tpu_int8", @@ -88,7 +87,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig - from .aqlm import AQLMConfig from .auto_round import AutoRoundConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -120,7 +118,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .tpu_int8 import Int8TpuConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "aqlm": AQLMConfig, "awq": AWQConfig, "deepspeedfp": DeepSpeedFPConfig, "tpu_int8": Int8TpuConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py deleted file mode 100644 index 2ea8c5dc51..0000000000 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM -# and https://arxiv.org/pdf/2401.06118.pdf - -import math -from typing import Any, Optional - -import torch -import torch.nn.functional as F -from 
torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.utils import set_weight_attrs - - -def get_int_dtype(nbits: int) -> torch.dtype: - if nbits <= 8: - return torch.int8 - if nbits <= 16: - return torch.int16 - if nbits <= 32: - return torch.int32 - if nbits <= 64: - return torch.int64 - raise ValueError(f"No dtype available for {nbits}-bit codebooks") - - -@torch.inference_mode() -def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: - return data.to(torch.int64) % (2**nbits) - - -def dequantize_weight(codes: torch.Tensor, - codebooks: torch.Tensor, - scales: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - Decode float weights from quantization codes. Differentiable. - :param codes: tensor of integer quantization codes, shape - [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, - [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be - broadcastble with - [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape - [*dims, num_in_groups*group_size] - """ - num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = \ - codebooks.shape - out_features = num_out_groups * out_group_size - in_features = num_in_groups * in_group_size - codebook_offsets = torch.arange( - 0, num_codebooks * codebook_size, codebook_size, - device=codes.device) # shape: [num_codebooks] - reconstructed_weight_flat = F.embedding_bag( - codes.flatten(0, -2) + codebook_offsets, - codebooks.flatten(0, 1).flatten(-2, -1), - mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size - # * in_group_size] - - reconstructed_weight_groupwise = reconstructed_weight_flat.view( - list(codes.shape[:-3]) + - [num_out_groups, num_in_groups, out_group_size, in_group_size]) - if scales is not None: - reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( - scales) - return reconstructed_weight_groupwise.swapaxes( - -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) - - -def dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - bias: Optional[torch.Tensor], -) -> torch.Tensor: - dequantized_weight = dequantize_weight( - unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), - codebooks, - scales, - ) - return F.linear(input, dequantized_weight, bias) - - -# Generic dequantization, slow but flexible. -def generic_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. 
- Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - output_shape = input.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=input.dtype, device=input.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. - # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big - # multiply at the end. - num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - input, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), None - if bias is None else bias.narrow(0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - return output - - -# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 -# at 6 and 9 times faster than the generic version above, respectively. -def optimized_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - # scaling the output is fastest, so we do that when possible. - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -class AQLMConfig(QuantizationConfig): - """Config class for AQLM. - - Reference: https://github.com/Vahe1994/AQLM - """ - - def __init__( - self, - in_group_size: int, - nbits_per_codebook: int, - num_codebooks: int, - out_group_size: int, - ) -> None: - super().__init__() - self.in_group_size = in_group_size - self.nbits_per_codebook = nbits_per_codebook - self.num_codebooks = num_codebooks - self.out_group_size = out_group_size - - # out_group_size > 1 is untested, and probably won't work as-is. 
- assert (self.out_group_size == 1) - self.pack_factor = (self.in_group_size * self.out_group_size) - - def __repr__(self) -> str: - return (f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "aqlm" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return [] # no extra configs. - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": - in_group_size = cls.get_from_keys(config, ["in_group_size"]) - nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) - num_code_books = cls.get_from_keys(config, ["num_codebooks"]) - out_group_size = cls.get_from_keys(config, ["out_group_size"]) - return cls(in_group_size, nbits_per_codebook, num_code_books, - out_group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AQLMLinearMethod"]: - if isinstance(layer, LinearBase): - return AQLMLinearMethod(self) - return None - - -class AQLMLinearMethod(LinearMethodBase): - """Linear method for AQLM. - - Args: - quant_config: The AQLM quantization config. - """ - - def __init__(self, quant_config: AQLMConfig): - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - del output_size # Unused. - del input_size # Unused. - - if params_dtype != torch.half: - raise ValueError("Only half is currently supported by aqlm") - if input_size_per_partition % self.quant_config.in_group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.out_group_size != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - codes = Parameter( - torch.empty( - # There could actually be two pack factors, one along input and - # one along output, but we don't currently support - # out_group_size, and only the one along output needs to be - # marked with "packed_dim" in order for QKVLinear to work. 
- output_size_per_partition, - input_size_per_partition // self.quant_config.pack_factor, - self.quant_config.num_codebooks, - dtype=get_int_dtype(self.quant_config.nbits_per_codebook), - ), - requires_grad=False, - ) - - set_weight_attrs( - codes, - { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) - - codebooks = Parameter( - torch.empty( - self.quant_config.num_codebooks * len(output_partition_sizes), - 2**self.quant_config.nbits_per_codebook, - self.quant_config.out_group_size, - self.quant_config.in_group_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - codebooks, - { - # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": output_partition_sizes - }, - ) - - scales = Parameter( - torch.empty( - ( - output_size_per_partition // - self.quant_config.out_group_size, - 1, - 1, - 1, - ), - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 0, - "packed_dim": 0, - "pack_factor": self.quant_config.out_group_size - }, - ) - - layer.register_parameter("codes", codes) - set_weight_attrs(codes, extra_weight_attrs) - layer.register_parameter("codebooks", codebooks) - set_weight_attrs(codebooks, extra_weight_attrs) - layer.register_parameter("scales", scales) - set_weight_attrs(scales, extra_weight_attrs) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - codebooks = layer.codebooks - codes = layer.codes - scales = layer.scales - output_partition_sizes = getattr(codebooks, "output_partition_sizes", - []) - - nbooks = codes.shape[2] - ingroups = codebooks.shape[3] - outgroups = codebooks.shape[2] - bits = codebooks.shape[1] - - # We support these formats with dedicated gemm and decompression - # kernels. 
- if ingroups == 8 and outgroups == 1 and ( - (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - - # thresholds determined by timings on an A6000, one GPU - use_gemv = math.prod(x.shape[:-1]) <= 6 - - return ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else optimized_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) - - # fall back all unoptimized formats - return generic_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) From bf7f470b22e8bf26e1edb30b3bf465ab7dd69f0c Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Sat, 16 Aug 2025 15:59:17 -0400 Subject: [PATCH 340/932] [V1] Logits processors extensibility (#19912) Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill Co-authored-by: Andrew Feldman Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + .../offline_inference/logits_processor.py | 147 +++++++++ tests/utils.py | 79 ++++- tests/v1/logits_processors/__init__.py | 0 .../test_correctness.py} | 24 +- .../logits_processors/test_custom_offline.py | 237 ++++++++++++++ .../logits_processors/test_custom_online.py | 180 +++++++++++ tests/v1/logits_processors/utils.py | 127 ++++++++ tests/v1/sample/test_rejection_sampler.py | 4 +- tests/v1/sample/test_sampler.py | 4 +- tests/v1/worker/test_gpu_input_batch.py | 4 +- vllm/config/__init__.py | 5 + vllm/engine/arg_utils.py | 8 + vllm/entrypoints/llm.py | 4 + vllm/utils/__init__.py | 2 +- vllm/v1/sample/logits_processor/__init__.py | 185 +++++++++++ .../builtin.py} | 294 ++---------------- vllm/v1/sample/logits_processor/interface.py | 86 +++++ vllm/v1/sample/logits_processor/state.py | 149 +++++++++ vllm/v1/sample/metadata.py | 4 +- vllm/v1/worker/gpu_input_batch.py | 91 ++++-- vllm/v1/worker/gpu_model_runner.py | 11 +- 22 files changed, 1312 insertions(+), 334 deletions(-) create mode 100644 examples/offline_inference/logits_processor.py create mode 100644 tests/v1/logits_processors/__init__.py rename tests/v1/{sample/test_logits_processors.py => logits_processors/test_correctness.py} (97%) create mode 100644 tests/v1/logits_processors/test_custom_offline.py create mode 100644 tests/v1/logits_processors/test_custom_online.py create mode 100644 tests/v1/logits_processors/utils.py create mode 100644 vllm/v1/sample/logits_processor/__init__.py rename vllm/v1/sample/{logits_processor.py => logits_processor/builtin.py} (54%) create mode 100644 vllm/v1/sample/logits_processor/interface.py create mode 100644 vllm/v1/sample/logits_processor/state.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 87296a08e2..4fc8857854 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -253,6 +253,7 @@ steps: - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py new file mode 100644 index 0000000000..7ef20efa7d --- /dev/null +++ b/examples/offline_inference/logits_processor.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates instantiating vLLM with a custom logits processor +class object. + +For a basic example of implementing a custom logits processor, see +the `DummyLogitsProcessor` implementation in `vllm/test_utils.py`. + +For testing purposes, a dummy logits processor is employed which, if +`target_token` is passed as a keyword argument to `SamplingParams.extra_args`, +will mask out all tokens except `target_token`. + +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect the `target_token` to be decoded in each step, yielding an output +similar to that shown below: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ +""" + +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.config import VllmConfig +from vllm.v1.sample.logits_processor import ( + BatchUpdate, + LogitsProcessor, + MoveDirectionality, +) + + +# Hypothetical custom logits processor +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__( + self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool + ): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and ( + target_token := params.extra_args.get("target_token") + ): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor( + [self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device, + ) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float("-inf") + logits[rows, cols] = values_to_keep + + return logits + + +# Sample prompts. 
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/utils.py b/tests/utils.py index 18fcde9491..e98707fb44 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,7 @@ import tempfile import time import warnings from contextlib import contextmanager, suppress +from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -76,6 +77,23 @@ VLLM_PATH = Path(__file__).parent.parent class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + """Subclasses override this method to customize server process launch + """ + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if env_dict is not None: + env.update(env_dict) + self.proc: subprocess.Popen = subprocess.Popen( + ["vllm", "serve", model, *vllm_serve_args], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + def __init__(self, model: str, vllm_serve_args: list[str], @@ -128,18 +146,7 @@ class RemoteOpenAIServer: model_loader = get_model_loader(load_config) model_loader.download_model(model_config) - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - if env_dict is not None: - env.update(env_dict) - self.proc = subprocess.Popen( - ["vllm", "serve", model, *vllm_serve_args], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) + self._start_server(model, vllm_serve_args, env_dict) max_wait_seconds = max_wait_seconds or 240 self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) @@ -155,6 +162,10 @@ class RemoteOpenAIServer: # force kill if needed self.proc.kill() + def _poll(self) -> Optional[int]: + """Subclasses override this method to customize process polling""" + return self.proc.poll() + def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() @@ -169,7 +180,7 @@ class RemoteOpenAIServer: # which means the server is not ready yet. # the stack trace is not useful, so we suppress it # by using `raise from None`. 
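The example above passes the processor class directly to the `LLM` constructor. The loader introduced later in this patch also discovers processors from the `vllm.logits_processors` entry-point group, which is how an installable plugin would ship one. A minimal packaging sketch, assuming setuptools; the distribution, module, and class names (`my-vllm-plugin`, `my_vllm_plugin.procs`, `TargetTokenLogitsProcessor`) are hypothetical placeholders:

# setup.py for a hypothetical third-party logits-processor plugin.
# After `pip install .`, vLLM can discover the processor through
# importlib.metadata entry points in the "vllm.logits_processors" group,
# without any --logits-processors flag or constructor argument.
from setuptools import find_packages, setup

setup(
    name="my-vllm-plugin",  # hypothetical distribution name
    version="0.1.0",
    packages=find_packages(),
    entry_points={
        "vllm.logits_processors": [
            # value uses the same "module.path:ClassName" FQCN syntax as
            # the --logits-processors CLI flag
            "target_token = my_vllm_plugin.procs:TargetTokenLogitsProcessor",
        ],
    },
)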
- result = self.proc.poll() + result = self._poll() if result is not None and result != 0: raise RuntimeError("Server exited unexpectedly.") from None @@ -205,6 +216,48 @@ class RemoteOpenAIServer: **kwargs) +class RemoteOpenAIServerCustom(RemoteOpenAIServer): + """Launch test server with custom child process""" + + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + self.proc: Process = Process( + target=self.child_process_fxn, + args=(env_dict, model, + vllm_serve_args)) # type: ignore[assignment] + self.proc.start() + + def __init__(self, + model: str, + vllm_serve_args: list[str], + child_process_fxn: Callable[ + [Optional[dict[str, str]], str, list[str]], None], + *, + env_dict: Optional[dict[str, str]] = None, + seed: Optional[int] = 0, + auto_port: bool = True, + max_wait_seconds: Optional[float] = None) -> None: + """Store custom child process function then invoke superclass + constructor which will indirectly launch it.""" + self.child_process_fxn = child_process_fxn + super().__init__(model=model, + vllm_serve_args=vllm_serve_args, + env_dict=env_dict, + seed=seed, + auto_port=auto_port, + max_wait_seconds=max_wait_seconds) + + def _poll(self) -> Optional[int]: + return self.proc.exitcode + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + self.proc.join(8) + if self.proc.is_alive(): + # force kill if needed + self.proc.kill() + + def _test_completion( client: openai.OpenAI, model: str, diff --git a/tests/v1/logits_processors/__init__.py b/tests/v1/logits_processors/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/v1/sample/test_logits_processors.py b/tests/v1/logits_processors/test_correctness.py similarity index 97% rename from tests/v1/sample/test_logits_processors.py rename to tests/v1/logits_processors/test_correctness.py index 84ee3b0392..43caef79b0 100644 --- a/tests/v1/sample/test_logits_processors.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -9,11 +9,13 @@ import numpy as np import pytest import torch +from tests.utils import create_new_process_for_each_test from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits, create_penalty_tensor, create_prompt_tokens_tensor, fake_apply_logitsprocs, fake_update_logitsprocs_state) +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available @@ -24,7 +26,7 @@ from vllm.v1.sample.logits_processor import (BatchUpdate, BatchUpdateBuilder, MinPLogitsProcessor, MinTokensLogitsProcessor, MoveDirectionality, - init_builtin_logitsprocs) + build_logitsprocs) # yapf: enable from vllm.v1.sample.metadata import SamplingMetadata @@ -53,6 +55,7 @@ class LogitsProcsRequestParams: workload_index: int logitproc_type: LogitprocType # Logitproc enabled, specified by str id out_tokens: list[int] # Output tokens required for min tokens test + prompt_tokens: list[int] # Dummy prompt tokens placeholder params: SamplingParams # Settings customized for logitproc def __init__(self, workload_index: int, logitproc_type: LogitprocType): @@ -63,6 +66,7 @@ class LogitsProcsRequestParams: # don't matter *for these tests* so use 0 as a dummy value self.out_tokens = ([0] * (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2))) + self.prompt_tokens = [] self.params = _sampling_params_from_logitproc(logitproc_type) def __str__(self): @@ -88,11 +92,12 @@ def _generate_fake_sampling_metadata( vocab_size, 
size=np.random.randint( 1, MAX_NUM_PROMPT_TOKENS)).tolist()) - logitsprocs = init_builtin_logitsprocs( - pin_memory_available=PIN_MEMORY_AVAILABLE, - max_num_reqs=MAX_NUM_REQS + 1, - device=device) - + logitsprocs = build_logitsprocs( + vllm_config=VllmConfig(), + device=device, + is_pin_memory=PIN_MEMORY_AVAILABLE, + is_pooling_model=False, + ) fake_sampling_metadata = SamplingMetadata( temperature=torch.full((batch_size, ), 0.0), all_greedy=True, @@ -462,7 +467,8 @@ def _generate_fake_step_update( # Replace as many removed requests as possible with added requests add_remove_idx = batch_update_builder.pop_removed() batch_update_builder.added.append( - (add_remove_idx, add_req_params.params, add_req_params.out_tokens)) + (add_remove_idx, add_req_params.params, + add_req_params.prompt_tokens, add_req_params.out_tokens)) persistent_batch[add_remove_idx] = add_req_params # Append remaining added requests to end of batch @@ -470,7 +476,8 @@ def _generate_fake_step_update( num_step_add_replace):(wdx + num_step_add)] batch_update_builder.added.extend([ - (adx + batch_size, add_req_params.params, add_req_params.out_tokens) + (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens, + add_req_params.out_tokens) for adx, add_req_params in enumerate(add_reqs_append) ]) persistent_batch.extend(add_reqs_append) @@ -561,6 +568,7 @@ def _assert_valid( step_idx=step_idx) +@create_new_process_for_each_test() @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py new file mode 100644 index 0000000000..a7fde1990f --- /dev/null +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import sys +from typing import Union + +import pytest + +from tests.utils import create_new_process_for_each_test +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + POOLING_MODEL_NAME, TEMP_GREEDY, + CustomLogitprocSource, + DummyLogitsProcessor, + dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts +# yapf: enable +from vllm import LLM, SamplingParams +from vllm.v1.sample.logits_processor import (STR_POOLING_REJECTS_LOGITSPROCS, + LogitsProcessor) + +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 128}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 67}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), +] + + +def _run_test(kwargs: dict, logitproc_loaded: bool) -> None: + """Compare `LLM` instance initialized with specified `kwargs` against + reference `LLM` instance. + + Two scenarios: + 1. 
Server has loaded dummy logitproc; test that requests which specify
+       dummy logitproc arg value behave as if logitproc is operating (output
+       token value should repeat), while requests that don't specify dummy
+       logitproc arg value should match reference `LLM` output.
+    2. Server has *not* loaded dummy logitproc; test that all requests
+       behave as if logitproc is *not* operating (output matches reference
+       `LLM` output.)
+
+    Args:
+      kwargs: `LLM` constructor kwargs
+      logitproc_loaded: server has loaded dummy logitproc if True
+    """
+
+    # Create a vLLM instance and load custom logitproc
+    llm_logitproc = LLM(
+        model=MODEL_NAME,
+        gpu_memory_utilization=0.1,
+        **kwargs,
+    )
+
+    # Create a reference vLLM instance without custom logitproc
+    llm_ref = LLM(model=MODEL_NAME, gpu_memory_utilization=0.1)
+
+    # Run inference with logitproc loaded
+    outputs_logitproc = llm_logitproc.generate(prompts, sampling_params_list)
+
+    # Reference run
+    outputs_ref = llm_ref.generate(prompts, sampling_params_list)
+
+    # Validate outputs
+    for bdx, (out_lp, out_ref, params) in enumerate(
+            zip(outputs_logitproc, outputs_ref, sampling_params_list)):
+        lp_toks = out_lp.outputs[0].token_ids
+        if logitproc_loaded and params.extra_args:
+            # This request exercises custom logitproc; validate that logitproc
+            # forces `target_token` to be decoded in each step
+            target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
+            if not all(x == target_token for x in lp_toks):
+                raise AssertionError(
+                    f"Request {bdx} generated {lp_toks}, should all be "
+                    f"{target_token}")
+        else:
+            # This request does not exercise custom logitproc (or custom
+            # logitproc is not enabled on this server); validate against
+            # reference result
+            ref_toks = out_ref.outputs[0].token_ids
+            if lp_toks != ref_toks:
+                raise AssertionError(
+                    f"Request {bdx} generated {lp_toks}, should match "
+                    f"{ref_toks}")
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource))
+def test_custom_logitsprocs(monkeypatch,
+                            logitproc_source: CustomLogitprocSource):
+    """Test offline Python interface for passing custom logitsprocs
+
+    Construct an `LLM` instance which loads a custom logitproc that has a
+    well-defined behavior (mask out all tokens except one `target_token`)
+
+    Construct a reference `LLM` instance with no custom logitproc
+
+    Pass in a batch of requests, 50% of which pass a `target_token` value
+    in through `SamplingParams.extra_args`, 50% of which do not.
+ + Validate that + * Requests which do not activate the custom logitproc, yield the same + results for both `LLM` instances + * Requests which activate the custom logitproc, only output `target_token` + + Test four scenarios, corresponding to `logitproc_source` value + * No logitsprocs loaded - test that generated tokens match reference `LLM` + instance output + * Logitproc passed in via {entrypoint, class object, fully-qualified class + name (FQCN)} - test that dummy logitproc is utilized correctly when + provided via any of these three possible sources + + Args: + monkeypatch: for setting env vars + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), class object, or None) the user pulls the + logitproc from + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + random.seed(40) + + # Choose LLM args based on logitproc source + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_NONE: + # Scenario: the server does not load any custom logitproc + # Every other scenario is a different way of loading a custom logitproc + _run_test({}, logitproc_loaded=False) + return + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a logitproc from a preconfigured entrypoint + # To that end, mock a dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for workers to see entrypoint patch + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + _run_test({}, logitproc_loaded=True) + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + # Inject dummy module which defines logitproc + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + _run_test(kwargs, logitproc_loaded=True) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", [ + CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT, + CustomLogitprocSource.LOGITPROC_SOURCE_FQCN, + CustomLogitprocSource.LOGITPROC_SOURCE_CLASS, +]) +def test_pooling_rejects_custom_logitsprocs( + monkeypatch, logitproc_source: CustomLogitprocSource): + """Validate that vLLM engine initialization properly rejects custom + logitsprocs when the model is a pooling model. + + Use `LLM` entrypoint. We expect `LLM` initialization to fail before the + logitproc is actually loaded. 
+ + Scenario 1: + * Mock a logitproc entrypoint + * Validate that `LLM` does not load the logitproc + + Scenario 2: + * Pass custom logitproc to `LLM` constructor + * Scenario 2a: via FQCN + * Scenario 2b: via class object + * Validate that initialization fails with appropriate exception + + Args: + monkeypatch: used to set environment variables + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), or class object) the user pulls the + logitproc from + """ + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + random.seed(40) + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a pooling model and ignores a logitproc that is + # available at a preconfigured entrypoint + + # Patch in dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for entrypoint patch to be visible to workers, + # although they should ignore the entrypoint patch anyway + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + + llm = LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + ) + # Require that no logitsprocs have been loaded + assert sum([ + 1 for _ in llm.llm_engine.model_executor.driver_worker.worker. + model_runner.input_batch.logitsprocs.all + ]) == 0 + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS): + # Require that loading a pooling model alongside the logitproc raises + # the appropriate exception. 
+ LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py new file mode 100644 index 0000000000..a01a479e5b --- /dev/null +++ b/tests/v1/logits_processors/test_custom_online.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random +import sys +from typing import Any, Optional + +import openai +import pytest +import pytest_asyncio + +from tests.utils import (RemoteOpenAIServerCustom, + create_new_process_for_each_test) +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + TEMP_GREEDY, dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts + +# yapf: enable + + +def _server_with_logitproc_entrypoint( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject dummy logitproc entrypoint""" + + # Patch `entry_points` to inject logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + from vllm.entrypoints.cli import main + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +def _server_with_logitproc_module( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject module with dummy logitproc""" + + # Patch `modules` to inject dummy logitproc module + from vllm.entrypoints.cli import main + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + ] + + +@pytest.fixture(scope="function", + params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]]) +def server(default_server_args, request, monkeypatch): + """Consider two server configurations: + (1) --logits-processors cli arg specifies dummy logits processor via fully- + qualified class name (FQCN); patch in a dummy logits processor module + (2) No --logits-processors cli arg; patch in a dummy logits processor + entrypoint + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + + if request.param: + # Launch server, append FQCN argument, inject dummy logitproc module + args = default_server_args + request.param + _server_fxn = _server_with_logitproc_module + else: + # Launch server, inject dummy logitproc entrypoint + args = default_server_args + _server_fxn = _server_with_logitproc_entrypoint + + with RemoteOpenAIServerCustom(MODEL_NAME, args, + _server_fxn) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def 
client(server): + async with server.get_async_client() as async_client: + yield async_client + + +# General request argument values for these tests +api_keyword_args = { + # Greedy sampling ensures that requests which receive the `target_token` + # arg will decode it in every step + "temperature": TEMP_GREEDY, + # Since EOS will never be decoded (unless `target_token` is EOS) + "max_tokens": MAX_TOKENS, + # Return decoded token logprobs (as a way of getting token id) + "logprobs": 0, +} + + +@create_new_process_for_each_test() +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str): + """Test custom logitsprocs when starting OpenAI server from CLI + + Launch vLLM OpenAI-compatible server, configured to load a custom logitproc + that has a well-defined behavior (mask out all tokens except one + `target_token`). + + Pass in requests, 50% of which pass a `target_token` value + in through `extra_body["vllm_xargs"]`, 50% of which do not. + + Validate that requests which activate the custom logitproc, repeat the same + token + """ + + use_dummy_logitproc = True + for prompt in prompts: + # Build request arguments + request_keyword_args: dict[str, Any] = { + **api_keyword_args, + } + if use_dummy_logitproc: + # 50% of requests pass target_token custom arg + target_token = random.choice([128, 67]) + # For requests which activate the dummy logitproc, choose one of + # two `target_token` values which are known not to be EOS tokens + request_keyword_args["extra_body"] = { + "vllm_xargs": { + DUMMY_LOGITPROC_ARG: target_token + } + } + batch = await client.completions.create( + model=model_name, + prompt=prompt, + **request_keyword_args, + ) + + if use_dummy_logitproc: + # Only for requests which activate dummy logitproc - validate that + # output token is repeated + choices: openai.types.CompletionChoice = batch.choices + toks = choices[0].logprobs.tokens + if not all([x == toks[0] for x in toks]): + raise AssertionError( + f"Generated {toks} should all be {toks[0]}") + + # Alternate whether to activate dummy logitproc for each request + use_dummy_logitproc = not use_dummy_logitproc diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py new file mode 100644 index 0000000000..c0bfc1a18f --- /dev/null +++ b/tests/v1/logits_processors/utils.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import types +from enum import Enum, auto +from typing import Optional + +import torch + +from vllm.config import VllmConfig +from vllm.sampling_params import SamplingParams +from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, + LogitsProcessor, + MoveDirectionality) + +MODEL_NAME = "facebook/opt-125m" +POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" +DUMMY_LOGITPROC_ARG = "target_token" +TEMP_GREEDY = 0.0 +MAX_TOKENS = 20 +DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc" +DUMMY_LOGITPROC_MODULE = "DummyModule" +DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor" + + +class CustomLogitprocSource(Enum): + """How to source a logitproc for testing purposes""" + LOGITPROC_SOURCE_NONE = auto() # No custom logitproc + LOGITPROC_SOURCE_ENTRYPOINT = auto() # Via entrypoint + LOGITPROC_SOURCE_FQCN = auto() # Via fully-qualified class name (FQCN) + LOGITPROC_SOURCE_CLASS = auto() # Via provided class object + + +# Sample prompts. 
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor([self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits + + +"""Dummy module with dummy logitproc class""" +dummy_module = types.ModuleType(DUMMY_LOGITPROC_MODULE) +dummy_module.DummyLogitsProcessor = DummyLogitsProcessor # type: ignore + + +class EntryPoint: + """Dummy entrypoint class for logitsprocs testing""" + + def __init__(self): + self.name = DUMMY_LOGITPROC_ENTRYPOINT + self.value = DUMMY_LOGITPROC_FQCN + + def load(self): + return DummyLogitsProcessor + + +class EntryPoints(list): + """Dummy EntryPoints class for logitsprocs testing""" + + def __init__(self, group: str): + # Emulate list-like functionality + eps = [EntryPoint()] if group == LOGITSPROCS_GROUP else [] + super().__init__(eps) + # Extra attributes + self.names = [ep.name for ep in eps] + + +"""Fake version of importlib.metadata.entry_points""" +entry_points = lambda group: EntryPoints(group) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3a4d48afc9..4e912f98f3 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from vllm.platforms import current_platform -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID, RejectionSampler) @@ -69,7 +69,7 @@ def create_sampling_metadata( output_token_ids=[], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 31c6c881d7..53215f88bb 
100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -9,7 +9,7 @@ import torch from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -173,7 +173,7 @@ def _create_default_sampling_metadata( no_penalties=True, allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) return fake_sampling_metadata diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 74ab19a3ce..d7b4746562 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -169,7 +169,7 @@ def _construct_expected_sampling_metadata( and all(x == 1 for x in repetition_penalties)), allowed_token_ids_mask=allowed_token_ids_mask, bad_words_token_ids=bad_words_token_ids, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 14fc5589a8..51db277f65 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -62,6 +62,7 @@ if TYPE_CHECKING: QuantizationConfig) from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.sample.logits_processor import LogitsProcessor HfOverrides = Union[dict, Callable[[type], type]] else: @@ -72,6 +73,7 @@ else: BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any + LogitsProcessor = Any HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -465,6 +467,9 @@ class ModelConfig: - "transformers" will use the Transformers model implementation.""" override_attention_dtype: Optional[str] = None """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 630fbec453..6fc894827c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -43,6 +43,7 @@ from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) +from vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -435,6 +436,10 @@ class EngineArgs: enable_multimodal_encoder_data_parallel: bool = \ ParallelConfig.enable_multimodal_encoder_data_parallel + logits_processors: Optional[list[Union[ + str, type[LogitsProcessor]]]] = 
ModelConfig.logits_processors + """Custom logitproc types""" + async_scheduling: bool = SchedulerConfig.async_scheduling # DEPRECATED enable_prompt_adapter: bool = False @@ -549,6 +554,8 @@ class EngineArgs: **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) + model_group.add_argument("--logits-processors", + **model_kwargs["logits_processors"]) # Model loading arguments load_kwargs = get_kwargs(LoadConfig) @@ -940,6 +947,7 @@ class EngineArgs: enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, override_attention_dtype=self.override_attention_dtype, + logits_processors=self.logits_processors, ) def validate_tensorizer_args(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b..b002f234c0 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -55,6 +55,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of +from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: from vllm.v1.metrics.reader import Metric @@ -198,6 +199,8 @@ class LLM: override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, + logits_processors: Optional[list[Union[str, + type[LogitsProcessor]]]] = None, **kwargs, ) -> None: """LLM constructor.""" @@ -272,6 +275,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, compilation_config=compilation_config_instance, + logits_processors=logits_processors, **kwargs, ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 64f7426bd6..5cb9f97ae0 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2562,7 +2562,7 @@ def direct_register_custom_op( def resolve_obj_by_qualname(qualname: str) -> Any: """ - Resolve an object by its fully qualified name. + Resolve an object by its fully-qualified class name. 
""" module_name, obj_name = qualname.rsplit(".", 1) module = importlib.import_module(module_name) diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py new file mode 100644 index 0000000000..8220269162 --- /dev/null +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +import itertools +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor, + MinPLogitsProcessor, + MinTokensLogitsProcessor) +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) +from vllm.v1.sample.logits_processor.state import (BatchUpdateBuilder, + LogitsProcessors) + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + +# Error message when the user tries to initialize vLLM with a pooling model +# and custom logitsproces +STR_POOLING_REJECTS_LOGITSPROCS = ("Pooling models do not support custom" + " logits processors.") + +LOGITSPROCS_GROUP = 'vllm.logits_processors' + +BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [ + MinTokensLogitsProcessor, + LogitBiasLogitsProcessor, + MinPLogitsProcessor, +] + + +def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]: + """Load all installed logit processor plugins""" + + import sys + + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP) + if len(installed_logitsprocs_plugins) == 0: + logger.debug("No logitsprocs plugins installed (group %s).", + LOGITSPROCS_GROUP) + return [] + + # Load logitsprocs plugins + logger.debug("Loading installed logitsprocs plugins (group %s):", + LOGITSPROCS_GROUP) + classes: list[type[LogitsProcessor]] = [] + for entrypoint in installed_logitsprocs_plugins: + try: + logger.debug("- Loading logitproc plugin entrypoint=%s target=%s", + entrypoint.name, entrypoint.value) + classes.append(entrypoint.load()) + except Exception as e: + raise RuntimeError( + f"Failed to load LogitsProcessor plugin {entrypoint}") from e + return classes + + +def _load_logitsprocs_by_fqcns( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]] +) -> list[type[LogitsProcessor]]: + """Load logit processor types, identifying them by fully-qualified class + names (FQCNs). + + Effectively, a mixed list of logitproc types and FQCN strings is converted + into a list of entirely logitproc types, by loading from the FQCNs. + + FQCN syntax is : i.e. 
x.y.z:CustomLogitProc + + Already-loaded logitproc types must be subclasses of LogitsProcessor + + Args: + logits_processors: Potentially mixed list of logitsprocs types and FQCN + strings for logitproc types + + Returns: + List of logitproc types + + """ + if not logits_processors: + return [] + + logger.debug( + "%s additional custom logits processors specified, checking whether " + "they need to be loaded.", len(logits_processors)) + + classes: list[type[LogitsProcessor]] = [] + for ldx, logitproc in enumerate(logits_processors): + if isinstance(logitproc, type): + logger.debug(" - Already-loaded logit processor: %s", + logitproc.__name__) + if not issubclass(logitproc, LogitsProcessor): + raise ValueError( + f"{logitproc.__name__} is not a subclass of LogitsProcessor" + ) + classes.append(logitproc) + continue + + logger.debug("- Loading logits processor %s", logitproc) + module_path, qualname = logitproc.split(":") + + try: + # Load module + module = importlib.import_module(module_path) + except Exception as e: + raise RuntimeError( + f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}" + ) from e + + # Walk down dotted name to get logitproc class + obj = module + for attr in qualname.split("."): + obj = getattr(obj, attr) + if not isinstance(obj, type): + raise ValueError("Loaded logit processor must be a type.") + if not issubclass(obj, LogitsProcessor): + raise ValueError( + f"{obj.__name__} must be a subclass of LogitsProcessor") + classes.append(obj) + + return classes + + +def _load_custom_logitsprocs( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]], +) -> list[type[LogitsProcessor]]: + """Load all custom logits processors. + + * First load all installed logitproc plugins + * Second load custom logitsprocs pass by the user at initialization time + + Args: + logits_processors: potentially mixed list of logitproc types and + logitproc type fully-qualified names (FQCNs) + which need to be loaded + + Returns: + A list of all loaded logitproc types + """ + from vllm.platforms import current_platform + if current_platform.is_tpu(): + # No logitsprocs specified by caller + # TODO(andy) - vLLM V1 on TPU does not support custom logitsprocs + return [] + + return (_load_logitsprocs_plugins() + + _load_logitsprocs_by_fqcns(logits_processors)) + + +def build_logitsprocs( + vllm_config: "VllmConfig", + device: torch.device, + is_pin_memory: bool, + is_pooling_model: bool, + custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (), +) -> LogitsProcessors: + if is_pooling_model: + if custom_logitsprocs: + raise ValueError(STR_POOLING_REJECTS_LOGITSPROCS) + logger.debug("Skipping logits processor loading because pooling models" + " do not support logits processors.") + return LogitsProcessors() + custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs) + return LogitsProcessors( + ctor(vllm_config, device, is_pin_memory) for ctor in itertools.chain( + BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes)) + + +__all__ = [ + "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor", + "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder", + "MoveDirectionality", "LogitsProcessors", "build_logitsprocs", + "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP" +] diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor/builtin.py similarity index 54% rename from vllm/v1/sample/logits_processor.py rename to vllm/v1/sample/logits_processor/builtin.py index 3a06e71057..24387ab793 100644 --- 
a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,241 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses -from abc import ABC, abstractmethod -from collections.abc import Iterator, Sequence -from dataclasses import dataclass, field -from enum import Enum -from itertools import chain -from typing import Optional, Union +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional import torch -from torch._prims_common import DeviceLikeType -from vllm import PoolingParams, SamplingParams -from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) -logger = init_logger(__name__) - - -class MoveDirectionality(Enum): - # One-way i1->i2 req move within batch - UNIDIRECTIONAL = 0 - # Two-way i1<->i2 req swap within batch - SWAP = 1 - - -# (index, params, output_tok_ids) tuples for new -# requests added to the batch. -AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int]] -# (index 1, index 2, directionality) tuples representing -# one-way moves or two-way swaps of requests in batch -MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. -RemovedRequest = int - - -@dataclasses.dataclass(frozen=True) -class BatchUpdate: - """Persistent batch state change info for logitsprocs""" - batch_size: int # Current num reqs in batch - - # Metadata for requests added to, removed from, and moved - # within the persistent batch. - # - # Note: each added request is represented as - # (index, params, output_tok_ids) - # Key assumption: output_tok_ids is a reference to the - # request's running output tokens list; in this way - # the logits processors always see the latest list of - # generated tokens - removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] - added: Sequence[AddedRequest] - - -class BatchUpdateBuilder: - """Helps track persistent batch state changes and build - a batch update data structure for logitsprocs - - Assumptions: - * All information about requests removed from persistent batch - during a step is aggregated in self._removed through calls to - self.removed_append() at the beginning of a step. This must happen - before the first time that self.removed, self.pop_removed() - or self.peek_removed() are invoked in a given step - * After the first time that self.removed, self.pop_removed() - or self.peek_removed() are read in a step, no new removals - are registered using self.removed_append() - * Elements of self._removed are never directly modified, added or - removed (i.e. modification is only via self.removed_append() and - self.pop_removed()) - - Guarantees under above assumptions: - * self.removed is always sorted in descending order - * self.pop_removed() and self.peek_removed() both return - the lowest removed request index in the current step - """ - - _removed: list[RemovedRequest] - _is_removed_sorted: bool - moved: list[MovedRequest] - added: list[AddedRequest] - - def __init__( - self, - removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, - added: Optional[list[AddedRequest]] = None, - ) -> None: - self._removed = removed or [] - self.moved = moved or [] - self.added = added or [] - self._is_removed_sorted = False - - def _ensure_removed_sorted(self) -> None: - """Sort removed request indices in - descending order. 
- - Idempotent after first call in a - given step, until reset. - """ - if not self._is_removed_sorted: - self._removed.sort(reverse=True) - self._is_removed_sorted = True - - @property - def removed(self) -> list[RemovedRequest]: - """Removed request indices sorted in - descending order""" - self._ensure_removed_sorted() - return self._removed - - def removed_append(self, index: int) -> None: - """Register the removal of a request from - the persistent batch. - - Must not be called after the first time - self.removed, self.pop_removed() or - self.peek_removed() are invoked. - - Args: - index: request index - """ - if self._is_removed_sorted: - raise RuntimeError("Cannot register new removed request after" - " self.removed has been read.") - self._removed.append(index) - - def has_removed(self) -> bool: - return bool(self._removed) - - def peek_removed(self) -> Optional[int]: - """Return lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed[-1] - return None - - def pop_removed(self) -> Optional[int]: - """Pop lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed.pop() - return None - - def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: - """Generate a logitsprocs batch update data structure - and reset internal batch update builder state. - - Args: - batch_size: current persistent batch size - - Returns: - Frozen logitsprocs batch update instance; `None` if no updates - """ - # Reset removal-sorting logic - self._is_removed_sorted = False - if not any((self._removed, self.moved, self.added)): - # No update; short-circuit - return None - # Build batch state update - batch_update = BatchUpdate( - batch_size=batch_size, - removed=self._removed, - moved=self.moved, - added=self.added, - ) - # Reset removed/moved/added update lists - self._removed = [] - self.moved = [] - self.added = [] - return batch_update - - -class LogitsProcessor(ABC): - - @abstractmethod - def apply(self, logits: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - @abstractmethod - def is_argmax_invariant(self) -> bool: - """True if logits processor has no impact on the - argmax computation in greedy sampling. - NOTE: may or may not have the same value for all - instances of a given LogitsProcessor subclass, - depending on subclass implementation. - TODO(andy): won't be utilized until logits - processors are user-extensible - """ - raise NotImplementedError - - @abstractmethod - def update_state( - self, - batch_update: Optional[BatchUpdate], - ) -> None: - """Called when there are new output tokens, prior - to each forward pass. - - Args: - batch_update is non-None iff there have been - changes to the batch makeup. 
- """ - raise NotImplementedError - - -@dataclass -class LogitsProcessorManager: - """Encapsulates initialized logitsproc objects.""" - argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # argmax-invariant logitsprocs - non_argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # non-argmax-invariant logitsprocs - - @property - def all(self) -> Iterator[LogitsProcessor]: - """Iterator over all logits processors.""" - return chain(self.argmax_invariant, self.non_argmax_invariant) - - -###### ----- Built-in LogitsProcessor impls below here +if TYPE_CHECKING: + from vllm.config import VllmConfig class MinPLogitsProcessor(LogitsProcessor): - def __init__(self, max_num_reqs: int, pin_memory: bool, - device: DeviceLikeType): - super().__init__() + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + max_num_reqs = vllm_config.scheduler_config.max_num_seqs self.min_p_count: int = 0 self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ), dtype=torch.float32, device="cpu", - pin_memory=pin_memory) + pin_memory=is_pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.use_double_tensor = torch.device("cpu") != torch.device(device) + self.use_double_tensor = torch.device(device).type != "cpu" if self.use_double_tensor: # Pre-allocated device tensor @@ -260,8 +51,8 @@ class MinPLogitsProcessor(LogitsProcessor): needs_update = False # Process added requests. - for index, params, _ in batch_update.added: - min_p = params.min_p if isinstance(params, SamplingParams) else 0.0 + for index, params, _, _ in batch_update.added: + min_p = params.min_p if self.min_p_cpu[index] != min_p: needs_update = True self.min_p_cpu[index] = min_p @@ -316,11 +107,10 @@ class MinPLogitsProcessor(LogitsProcessor): class LogitBiasLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): - super().__init__() - self.biases: dict[int, dict[int, float]] = {} + def __init__(self, _, device: torch.device, is_pin_memory: bool): self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.biases: dict[int, dict[int, float]] = {} self.bias_tensor: torch.Tensor = torch.tensor(()) self.logits_slice = (self._device_tensor([], torch.int32), @@ -337,9 +127,8 @@ class LogitBiasLogitsProcessor(LogitsProcessor): needs_update: bool = False # Process added requests. - for index, params, _ in batch_update.added: - if isinstance(params, SamplingParams) and (lb := - params.logit_bias): + for index, params, _, _ in batch_update.added: + if lb := params.logit_bias: self.biases[index] = lb needs_update = True else: @@ -400,12 +189,12 @@ class LogitBiasLogitsProcessor(LogitsProcessor): class MinTokensLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): # index -> (min_toks, output_token_ids, stop_token_ids) - super().__init__() - self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} # (req_idx_tensor,eos_tok_id_tensor) self.logits_slice: tuple[torch.Tensor, @@ -424,9 +213,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): if batch_update: # Process added requests. 
- for index, params, output_tok_ids in batch_update.added: - if (isinstance(params, SamplingParams) - and (min_tokens := params.min_tokens) + for index, params, _, output_tok_ids in batch_update.added: + if ((min_tokens := params.min_tokens) and len(output_tok_ids) < min_tokens): # Replace request metadata at batch index self.min_toks[index] = (min_tokens, output_tok_ids, @@ -499,35 +287,3 @@ class MinTokensLogitsProcessor(LogitsProcessor): # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits - - -def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int, - device: torch.device) -> LogitsProcessorManager: - """Construct 'builtin' vLLM logitsprocs which the engine - loads by default. - - Args: - pin_memory_available: pinned memory is available for use - for use by logitsproc - max_num_reqs: ceiling on request count in persistent batch - device: inference device - - Returns: - Data structure encapsulating loaded logitsprocs - """ - min_tokens_logitproc = MinTokensLogitsProcessor( - pin_memory=pin_memory_available, device=device) - logit_bias_logitproc = LogitBiasLogitsProcessor( - pin_memory=pin_memory_available, device=device) - min_p_logitproc = MinPLogitsProcessor( - pin_memory=pin_memory_available, - device=device, - # +1 for temporary swap space - max_num_reqs=max_num_reqs + 1) - return LogitsProcessorManager( - non_argmax_invariant=[ - min_tokens_logitproc, - logit_bias_logitproc, - ], - argmax_invariant=[min_p_logitproc], - ) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py new file mode 100644 index 0000000000..12b4db24bf --- /dev/null +++ b/vllm/v1/sample/logits_processor/interface.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm import SamplingParams + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + +# (index, params, prompt_tok_ids, output_tok_ids) tuples for new +# requests added to the batch. +AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + +# (index 1, index 2, directionality) tuples representing +# one-way moves or two-way swaps of requests in batch +MovedRequest = tuple[int, int, MoveDirectionality] + +# Batch indices of any removed requests. +RemovedRequest = int + + +@dataclass(frozen=True) +class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. 
+ # + # Key assumption: the `output_tok_ids` list (which is an element of each + # tuple in `added`) is a reference to the request's running output tokens + # list; via this reference, the logits processors always see the latest + # list of generated output tokens + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + + +class LogitsProcessor(ABC): + + @abstractmethod + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool) -> None: + raise NotImplementedError + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def is_argmax_invariant(self) -> bool: + """True if logits processor has no impact on the + argmax computation in greedy sampling. + NOTE: may or may not have the same value for all + instances of a given LogitsProcessor subclass, + depending on subclass implementation. + """ + raise NotImplementedError + + @abstractmethod + def update_state( + self, + batch_update: Optional["BatchUpdate"], + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. + """ + raise NotImplementedError diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py new file mode 100644 index 0000000000..0f58b52496 --- /dev/null +++ b/vllm/v1/sample/logits_processor/state.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.v1.sample.logits_processor.interface import (AddedRequest, + BatchUpdate, + MovedRequest, + RemovedRequest) + +if TYPE_CHECKING: + from vllm.v1.sample.logits_processor.interface import LogitsProcessor + + +class BatchUpdateBuilder: + """Helps track persistent batch state changes and build + a batch update data structure for logitsprocs + Assumptions: + * All information about requests removed from persistent batch + during a step is aggregated in self._removed through calls to + self.removed_append() at the beginning of a step. This must happen + before the first time that self.removed, self.pop_removed() + or self.peek_removed() are invoked in a given step + * After the first time that self.removed, self.pop_removed() + or self.peek_removed() are read in a step, no new removals + are registered using self.removed_append() + * Elements of self._removed are never directly modified, added or + removed (i.e. modification is only via self.removed_append() and + self.pop_removed()) + Guarantees under above assumptions: + * self.removed is always sorted in descending order + * self.pop_removed() and self.peek_removed() both return + the lowest removed request index in the current step + """ + + _removed: list[RemovedRequest] + _is_removed_sorted: bool + moved: list[MovedRequest] + added: list[AddedRequest] + + def __init__( + self, + removed: Optional[list[RemovedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, + added: Optional[list[AddedRequest]] = None, + ) -> None: + self._removed = removed or [] + self.moved = moved or [] + self.added = added or [] + self._is_removed_sorted = False + + def _ensure_removed_sorted(self) -> None: + """Sort removed request indices in + descending order. + Idempotent after first call in a + given step, until reset. 
+ """ + if not self._is_removed_sorted: + self._removed.sort(reverse=True) + self._is_removed_sorted = True + + @property + def removed(self) -> list[RemovedRequest]: + """Removed request indices sorted in + descending order""" + self._ensure_removed_sorted() + return self._removed + + def removed_append(self, index: int) -> None: + """Register the removal of a request from the persistent batch. + + Must not be called after the first time self.removed, + self.pop_removed() or self.peek_removed() are invoked. + + Args: + index: request index + """ + if self._is_removed_sorted: + raise RuntimeError("Cannot register new removed request after" + " self.removed has been read.") + self._removed.append(index) + + def has_removed(self) -> bool: + return bool(self._removed) + + def peek_removed(self) -> Optional[int]: + """Return lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed[-1] + return None + + def pop_removed(self) -> Optional[int]: + """Pop lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed.pop() + return None + + def _is_update(self) -> bool: + """True if there is a batch state change""" + return any((self._removed, self.moved, self.added)) + + def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: + """Generate a logitsprocs batch update data structure and reset + internal batch update builder state. + + Args: + batch_size: current persistent batch size + + Returns: + Frozen logitsprocs batch update instance; `None` if no updates + """ + # Reset removal-sorting logic + self._is_removed_sorted = False + if not self._is_update(): + # No update; short-circuit + return None + # Build batch state update + batch_update = BatchUpdate( + batch_size=batch_size, + removed=self._removed, + moved=self.moved, + added=self.added, + ) + self._removed = [] + self.moved = [] + self.added = [] + return batch_update + + +class LogitsProcessors: + """Encapsulates initialized logitsproc objects.""" + + def __init__( + self, + logitsprocs: Optional[Iterator["LogitsProcessor"]] = None) -> None: + self.argmax_invariant: list[LogitsProcessor] = [] + self.non_argmax_invariant: list[LogitsProcessor] = [] + if logitsprocs: + for logitproc in logitsprocs: + (self.argmax_invariant if logitproc.is_argmax_invariant() else + self.non_argmax_invariant).append(logitproc) + + @property + def all(self) -> Iterator["LogitsProcessor"]: + """Iterator over all logits processors.""" + return chain(self.argmax_invariant, self.non_argmax_invariant) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 1189b12f30..9d6a87cea3 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -6,7 +6,7 @@ from typing import Optional import torch -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors @dataclass @@ -40,4 +40,4 @@ class SamplingMetadata: bad_words_token_ids: dict[int, list[list[int]]] # Loaded logits processors - logitsprocs: LogitsProcessorManager + logitsprocs: LogitsProcessors diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 2469e09f82..e718d9d5e0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -18,8 +18,8 @@ from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import 
(BatchUpdateBuilder, - MoveDirectionality, - init_builtin_logitsprocs) + LogitsProcessors, + MoveDirectionality) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice @@ -78,8 +78,11 @@ class InputBatch: pin_memory: bool, vocab_size: int, block_sizes: list[int], # The block_size of each kv cache group + logitsprocs: Optional[LogitsProcessors] = None, is_spec_decode: bool = False, + is_pooling_model: bool = False, ): + self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -221,14 +224,6 @@ class InputBatch: # updates. Should reset each step. self.batch_update_builder = BatchUpdateBuilder() - # Define logits processors. - # TODO(andy): logits processor list should be extensible via engine - # constructor argument; for now the list is fixed. - self.logitsprocs = init_builtin_logitsprocs( - pin_memory_available=pin_memory, - max_num_reqs=max_num_reqs + 1, - device=device) - # TODO convert this to LogitsProcessor self.has_allowed_token_ids: set[str] = set() # NOTE(lufang): In the mask tensor, if the corresponding token allowed, @@ -244,6 +239,10 @@ class InputBatch: self.req_output_token_ids: list[Optional[list[int]]] = [] + # Store provided logitsprocs. If none are provided, initialize empty + # data structure + self.logitsprocs = logitsprocs or LogitsProcessors() + # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() @@ -255,28 +254,35 @@ class InputBatch: # while performing state updates to the batch. return cast(list[str], self._req_ids) - def _get_next_add_index(self) -> int: - if (req_index := self.batch_update_builder.pop_removed()) is not None: - # Fill the empty index. - return req_index - # Append to end - return self.num_reqs - def _register_add_request(self, request: "CachedRequestState") -> int: - """Track add-request operations""" - req_index = self._get_next_add_index() - assert req_index < self.max_num_reqs - params = (request.sampling_params - if request.sampling_params else request.pooling_params) + """Track add-request operations for logits processors. + Not applicable to pooling models. + """ + + # Detailed added request metadata is only required for non-pooling + # models, to support logitsprocs + assert request.sampling_params + + # Fill the next empty index if there is one. + if (new_req_index := self.batch_update_builder.pop_removed()) is None: + # Append to end otherwise. + new_req_index = self.num_reqs + + assert new_req_index < self.max_num_reqs self.batch_update_builder.added.append( - (req_index, params, request.output_token_ids)) - return req_index + (new_req_index, request.sampling_params, request.prompt_token_ids, + request.output_token_ids)) + return new_req_index def add_request( self, request: "CachedRequestState", ) -> int: - req_index = self._register_add_request(request) + if not self.is_pooling_model: + # New request index bookkeeping for autoregressive models. + req_index = self._register_add_request(request) + else: + req_index = self.num_reqs req_id = request.req_id if req_index == len(self._req_ids): @@ -411,7 +417,10 @@ class InputBatch: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: return None - self.batch_update_builder.removed_append(req_index) + if not self.is_pooling_model: + # Autoregressive models require bookkeeping of removed requests to + # support logitsprocs. 
+ self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None @@ -446,6 +455,8 @@ class InputBatch: return req_index def swap_states(self, i1: int, i2: int) -> None: + # For autoregressive models, track detailed request reordering info + # to support logitsprocs self.batch_update_builder.moved.append( (i1, i2, MoveDirectionality.SWAP)) old_id_i1 = self._req_ids[i1] @@ -513,11 +524,18 @@ class InputBatch: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation """ + num_reqs = self.num_reqs + + if self.is_pooling_model: + # Will be contiguous in pooling case, just trim the lists. + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] + return + if not (empty_req_indices := self.batch_update_builder.removed): # All removed requests were replaced by added requests, or else no # requests were removed at all. No condense() needed return - num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. self._req_ids.clear() @@ -541,6 +559,8 @@ class InputBatch: # Move active request down into empty request # index. self.batch_update_builder.pop_removed() + # Autoregressive models require detailed tracking of condense + # operations to support logitsprocs self.batch_update_builder.moved.append( (last_req_index, empty_index, MoveDirectionality.UNIDIRECTIONAL)) @@ -596,15 +616,20 @@ class InputBatch: last_req_index -= 1 # Trim lists to the batch size. - del self._req_ids[self.num_reqs:] - del self.req_output_token_ids[self.num_reqs:] + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] def refresh_metadata(self): - """Apply batch updates, reset input batch at end of step + """Apply any batch updates to sampling metadata.""" - * Apply batch add/remove/permute to logits procs' states - * If batch state is modified, update sampling metadata - """ + if self.is_pooling_model: + # Batch changes every step for pooling models. + self.sampling_metadata = self._make_sampling_metadata() + return + + # For non-pooling models - generate and apply logitsprocs update; + # reset batch update tracking. + # Update sampling metadata if batch state is changed. 
batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) for logit_proc in self.logitsprocs.all: logit_proc.update_state(batch_update) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5ee44a8257..4219d9147a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -68,6 +68,7 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -80,7 +81,6 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from ..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -221,6 +221,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=[self.cache_config.block_size], is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=build_logitsprocs( + self.vllm_config, self.device, self.pin_memory, + self.is_pooling_model, + self.vllm_config.model_config.logits_processors), + is_pooling_model=self.is_pooling_model, ) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. @@ -2447,7 +2452,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): output_token_ids=[[] for _ in range(num_reqs)], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) try: sampler_output = self.sampler(logits=logits, @@ -2968,6 +2973,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=block_sizes, is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=self.input_batch.logitsprocs, + is_pooling_model=self.is_pooling_model, ) def _allocate_kv_cache_tensors( From a258ad8bcc0014c04d11a9bc8c6591b379c31b68 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 17 Aug 2025 08:41:23 +0800 Subject: [PATCH 341/932] [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031) Signed-off-by: Jinzhen Lin --- vllm/model_executor/layers/quantization/fp8.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a497449132..f07be08554 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -125,6 +125,10 @@ class Fp8Config(QuantizationConfig): ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) + if not ignored_layers: + ignored_layers = cls.get_from_keys_or(config, + ["modules_to_not_convert"], + None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme, ignored_layers=ignored_layers, From 94096a47c92c4a53ad44cfffdca918669c0f89e0 Mon Sep 17 00:00:00 2001 From: Michael 
Goin Date: Sat, 16 Aug 2025 22:16:42 -0400 Subject: [PATCH 342/932] [UX] Separate marlin moe config logic from triton moe (#23006) --- .../layers/fused_moe/fused_marlin_moe.py | 20 ++++++------------- .../layers/fused_moe/fused_moe.py | 9 +-------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index a49d41c184..3c6ece6737 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" -import functools from typing import Optional import torch import vllm._custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_moe import ( - moe_align_block_size, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_make_workspace_new, maybe_warn_marlin_atomic_add) from vllm.scalar_type import ScalarType, scalar_types @@ -98,17 +96,11 @@ def fused_marlin_moe(hidden_states: torch.Tensor, N = w2.shape[1] * 16 topk = topk_ids.shape[1] - get_config_func = functools.partial( - try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - None, - is_marlin=True, - ) - config = get_config_func(M) - - block_size_m = config["BLOCK_SIZE_M"] + # M block size selection logic + # TODO: tune this further for specific models + for block_size_m in [8, 16, 32, 48, 64]: + if M * topk / E / block_size_m < 0.9: + break if global_num_experts == -1: global_num_experts = E diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index e58a9e568d..3579ca22ba 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -801,7 +801,6 @@ def get_default_config( K: int, topk: int, dtype: Optional[str], - is_marlin: bool, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: @@ -832,11 +831,6 @@ def get_default_config( config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1} else: config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1} - elif is_marlin: - for block_size_m in [8, 16, 32, 48, 64]: - if M * topk / E / block_size_m < 0.9: - break - return {"BLOCK_SIZE_M": block_size_m} elif M <= E: config = { "BLOCK_SIZE_M": 16, @@ -860,7 +854,6 @@ def try_get_optimal_moe_config( top_k: int, dtype: Optional[str], M: int, - is_marlin: bool = False, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: from vllm.model_executor.layers.fused_moe import get_config @@ -883,7 +876,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin, block_shape) + block_shape) return config From 5c32143b9db19ae728087019678843fa238afa82 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 17 Aug 2025 12:05:50 +0800 Subject: [PATCH 343/932] [Refactor] Defer tensor data construction in MultiModalKwargs (#23030) Signed-off-by: DarkLight1337 --- tests/multimodal/test_cache.py | 2 +- tests/v1/test_serial_utils.py | 34 +------ vllm/inputs/registry.py | 2 +- .../models/prithvi_geospatial_mae.py | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 96 
+++++++++++-------- vllm/multimodal/processing.py | 2 +- vllm/multimodal/utils.py | 12 ++- vllm/sequence.py | 4 +- vllm/v1/serial_utils.py | 17 +--- vllm/v1/worker/gpu_input_batch.py | 2 +- 12 files changed, 73 insertions(+), 104 deletions(-) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e07b73bd25..2149f05b6a 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -25,7 +25,7 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ + return MultiModalKwargs([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 0ab4e0bf59..586276ee08 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -100,38 +100,6 @@ class MyRequest(msgspec.Struct): def test_multimodal_kwargs(): - d = { - "foo": - torch.zeros(20000, dtype=torch.float16), - "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)], - "baz": [ - torch.rand((256), dtype=torch.float16), - [ - torch.rand((1, 12), dtype=torch.float32), - torch.rand((3, 5, 7), dtype=torch.float64), - ], [torch.rand((4, 4), dtype=torch.float16)] - ], - } - - # pack mm kwargs into a mock request so that it can be decoded properly - req = MyRequest(mm=[MultiModalKwargs(d)]) - - encoder = MsgpackEncoder() - decoder = MsgpackDecoder(MyRequest) - - encoded = encoder.encode(req) - - assert len(encoded) == 6 - - total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - - # expected total encoding length, should be 44559, +-20 for minor changes - assert 44539 <= total_len <= 44579 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] - assert all(nested_equal(d[k], decoded[k]) for k in d) - - -def test_multimodal_items_by_modality(): e1 = MultiModalFieldElem("audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()) @@ -151,7 +119,7 @@ def test_multimodal_items_by_modality(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs.from_items([audio, video, image]) + mm = MultiModalKwargs([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc32365083..ef146fdfbf 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -240,6 +240,6 @@ class InputRegistry: return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data, + multi_modal_data=dec_data.multi_modal_data.get_data(), multi_modal_placeholders=dec_data.multi_modal_placeholders, ) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 20f423cc76..6848882907 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -136,7 +136,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): type="multimodal", prompt=prompt, prompt_token_ids=[1], - mm_kwargs=MultiModalKwargs.from_items(multimodal_kwargs_items), + mm_kwargs=MultiModalKwargs(multimodal_kwargs_items), mm_hashes=None, mm_placeholders=mm_placeholders, ) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 
7188ed14c5..ef8f1b2e17 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs({}), {} + return MultiModalKwargs(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 6074a4d54f..8c4136e06f 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -46,7 +46,7 @@ class MultiModalCache: ) -> int: # MultiModalKwargs is not a subclass of dict if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data, debug=debug) + return cls.get_item_size(leaf.get_data(), debug=debug) # MultiModalKwargsItem is not a subclass of dict if isinstance(leaf, MultiModalKwargsItem): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index a33ce14699..d3f57cf533 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -653,7 +653,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) - def __init__(self, data: Mapping[str, MultiModalFieldElem]) -> None: + def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None: super().__init__(data) modalities = {elem.modality for elem in self.data.values()} @@ -668,9 +668,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): return {key: elem.data for key, elem in self.items()} -# NOTE: UserDict is for V0 compatibility. -# V1 should access individual items via `get_item`. -class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargs: """ A dictionary that represents the keyword arguments to [`torch.nn.Module.forward`][]. 
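This refactor makes MultiModalKwargs wrap a sequence of MultiModalKwargsItem objects directly (the old MultiModalKwargs.from_items classmethod goes away) and defers building the flat key-to-tensor mapping until get_data() is called. A minimal usage sketch, not taken from the patch, is shown below; the tensor shape and the vllm.multimodal.inputs import location are assumptions.

# Illustrative sketch only (not part of the patch): construct MultiModalKwargs
# from items and materialize the per-key data lazily via get_data().
import torch

from vllm.multimodal.inputs import (MultiModalBatchedField,
                                    MultiModalFieldElem, MultiModalKwargs,
                                    MultiModalKwargsItem)

# One modality item holding a single keyword-argument tensor.
elem = MultiModalFieldElem("image", "pixel_values",
                           torch.zeros(3, 224, 224),
                           MultiModalBatchedField())
item = MultiModalKwargsItem.from_elems([elem])

# New style: pass items straight to the constructor
# (previously MultiModalKwargs.from_items([...])).
mm = MultiModalKwargs([item])

# The flat key -> tensor mapping is only built on first access.
data = mm.get_data()
assert "pixel_values" in data

Callers that need pinned host memory can request it at materialization time via get_data(pin_memory=True), which is what the commented-out path in group_mm_kwargs_by_modality further down hints at.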
@@ -714,40 +712,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): elems = [v[item_idx] for v in elems_in_modality.values()] items.append(MultiModalKwargsItem.from_elems(elems)) - return MultiModalKwargs.from_items(items) + return MultiModalKwargs(items) - @staticmethod - def from_items( - items: Sequence[MultiModalKwargsItem], - *, - pin_memory: bool = False, - ): - """Construct a new - [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] - from multiple items.""" - elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for item in items: - for key, elem in item.items(): - elems_by_key[key].append(elem) + def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None: + super().__init__() - data = { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 - } - - return MultiModalKwargs(data, items=items) - - def __init__( - self, - data: Mapping[str, NestedTensors], - *, - items: Optional[Sequence[MultiModalKwargsItem]] = None, - ) -> None: - super().__init__(data) - - items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) + self._data: Optional[Mapping[str, NestedTensors]] = None + @property def modalities(self): return self._items_by_modality.keys() @@ -839,22 +813,41 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return cast(BatchedTensorInputs, json_mapped) - def __delitem__(self, key: str) -> None: - super().__delitem__(key) + def keys(self): + return self.get_data().keys() + + def values(self): + return self.get_data().values() + + def items(self): + return self.get_data().items() + + def get(self, key: str, /, default=None): + return self.get_data().get(key, default) + + def pop(self, key: str, *args, **kwargs): + data = dict(self.get_data()) + res = data.pop(key, *args, **kwargs) for items in self._items_by_modality.values(): for item in items: - item.pop(key, None) + item.pop(key, *args, **kwargs) + + self._data = None + + return res + + def __iter__(self): + return iter(self.get_data()) + + def __getitem__(self, key: str): + return self.get_data()[key] def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_modality != other._items_by_modality: - return False - ks = self.keys() - return (ks == other.keys() - and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + return self._items_by_modality == other._items_by_modality def _validate_modality(self, method_name: str, modality: str) -> None: if not self._items_by_modality: @@ -888,6 +881,25 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_data(self, + *, + pin_memory: bool = False) -> Mapping[str, NestedTensors]: + if self._data is not None: + return self._data + + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for items in self._items_by_modality.values(): + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + + data = { + key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + for key, elems in elems_by_key.items() if len(elems) > 0 + } + self._data = data + return data + MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 
38c5d5d99f..4684bf6f3d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1480,7 +1480,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_missing_kwargs=mm_missing_kwargs, ) - mm_kwargs = MultiModalKwargs.from_items([ + mm_kwargs = MultiModalKwargs([ item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f914d0dc6c..a80f09bb19 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -402,12 +402,14 @@ def group_mm_kwargs_by_modality( for modality, items in groupby(mm_kwargs, key=lambda item: item.modality): items_lst = list(items) - # mm_kwargs_group = MultiModalKwargs.from_items(items_lst, - # pin_memory=pin_memory) + # mm_kwargs_group = MultiModalKwargs(items_lst) \ + # .get_data(pin_memory=pin_memory) # if device is not None: - # mm_kwargs_group = json_map_leaves(lambda x: x.to(device=device), - # mm_kwargs_group.data) + # mm_kwargs_group = json_map_leaves( + # lambda x: x.to(device=device), + # mm_kwargs_group, + # ) # TODO: Once V0 is removed, we can use the merging logic above # to avoid creating an extra batch dimension (except for fields @@ -415,7 +417,7 @@ def group_mm_kwargs_by_modality( # We will also need to update each model to remove `flatten_bn`. mm_kwargs_group = MultiModalKwargs.as_kwargs( MultiModalKwargs.batch( - [MultiModalKwargs.from_items([item]) for item in items_lst], + [MultiModalKwargs([item]) for item in items_lst], pin_memory=pin_memory, ), device=device, diff --git a/vllm/sequence.py b/vllm/sequence.py index cbe63f8d1d..b3be10b6bb 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -524,7 +524,7 @@ class Sequence: if self.inputs["type"] == "multimodal": return self.inputs["mm_kwargs"] - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: @@ -780,7 +780,7 @@ class SequenceGroup: return self.first_seq.multi_modal_data elif self.encoder_seq is not None: return self.encoder_seq.multi_modal_data - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 3f0fad8a64..2857d8ef42 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -117,16 +117,9 @@ class MsgpackEncoder: return self._encode_mm_item(obj) if isinstance(obj, MultiModalKwargs): - mm: MultiModalKwargs = obj - if not mm.modalities: - # just return the main dict if there are no modalities. - return dict(mm) - - # ignore the main dict, it will be re-indexed. - # Any tensors *not* indexed by modality will be ignored. 
return [ self._encode_mm_item(item) - for itemlist in mm._items_by_modality.values() + for itemlist in obj._items_by_modality.values() for item in itemlist ] @@ -268,13 +261,7 @@ class MsgpackDecoder: if issubclass(t, MultiModalKwargsItem): return self._decode_mm_item(obj) if issubclass(t, MultiModalKwargs): - if isinstance(obj, list): - return MultiModalKwargs.from_items( - self._decode_mm_items(obj)) - return MultiModalKwargs({ - k: self._decode_nested_tensors(v) - for k, v in obj.items() - }) + return MultiModalKwargs(self._decode_mm_items(obj)) if t is UtilityResult: return self._decode_utility_result(obj) return obj diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e718d9d5e0..3d4cf27a6c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -58,7 +58,7 @@ class CachedRequestState: @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " "removed in v0.13. Please use `mm_kwargs` instead.") def mm_inputs(self) -> list[MultiModalKwargs]: - return [MultiModalKwargs.from_items([item]) for item in self.mm_kwargs] + return [MultiModalKwargs([item]) for item in self.mm_kwargs] def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: From 87f48623a537d379284bb3e3d1b23ab0ee2af1c1 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 17 Aug 2025 12:49:14 +0800 Subject: [PATCH 344/932] [Misc] method name typo fix (#23042) Signed-off-by: Andy Xie --- vllm/v1/worker/cpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 11b96d9463..a7180afbd6 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -29,7 +29,7 @@ class CPUModelRunner(GPUModelRunner): self.use_cuda_graph = False self.cascade_attn_enabled = False - self._postprocess_tenosrs() + self._postprocess_tensors() def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ @@ -59,7 +59,7 @@ class CPUModelRunner(GPUModelRunner): self.attn_groups[0][0].metadata_builder.reorder_batch( self.input_batch, scheduler_output) - def _postprocess_tenosrs(self) -> None: + def _postprocess_tensors(self) -> None: # Note: replace device tensors with cpu tensors def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None: From 4d4061b6e73d82f7e561fff64c2bd914d66ebaff Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 17 Aug 2025 13:03:24 +0800 Subject: [PATCH 345/932] [Kernel] Add cuda kernel for gpt_oss activation (#22951) Signed-off-by: Jee Jee Li --- csrc/activation_kernels.cu | 59 +++++++++++++++++++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 6 ++ tests/kernels/core/test_activation.py | 45 ++++++++++++-- vllm/model_executor/layers/activation.py | 41 ++++++++++++- .../layers/fused_moe/fused_marlin_moe.py | 22 ++----- .../layers/fused_moe/fused_moe.py | 18 ++---- .../layers/quantization/utils/mxfp4_utils.py | 4 +- vllm/model_executor/models/gpt_oss.py | 2 +- 9 files changed, 157 insertions(+), 42 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e6596797..a4a880f13c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, + float alpha, float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? 
limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? -limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 6e39758f16..64bcec6ca1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 5fee106335..7079671c2e 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,6 +130,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + ops.def( + "swigluoai_and_mul(Tensor! out, Tensor input, float alpha=1.702, float " + "limit=7.0) " + "-> ()"); + ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8b..ec5c60fd7b 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul) + SiluAndMul, SwigluOAIAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,7 +25,15 @@ CUDA_DEVICES = [ @pytest.mark.parametrize( "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + [ + "silu_and_mul", + "mul_and_silu", + "gelu", + "gelu_tanh", + "fatrelu", + "swigluoai_and_mul", + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -59,18 +67,43 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul + elif activation == "swigluoai_and_mul": + layer = SwigluOAIAndMul() + fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + if activation == "swigluoai_and_mul": + + rtol = { + #For fp16, change the relative tolerance from 1e-3 to 2e-3 + torch.float16: + 2e-3, + torch.bfloat16: + 2e-2, + torch.float: + 1.3e-6 + } + + def _get_rtol(output) -> float: + return rtol[output.dtype] + + torch.testing.assert_close(out, + ref_out, + atol=get_default_atol(out), + rtol=_get_rtol(out)) + else: + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
+ torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) + elif activation == "swigluoai_and_mul": + opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7ce44174ea..86ab4f546d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,6 +239,35 @@ class GeluAndMul(CustomOp): return f'approximate={repr(self.approximate)}' +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -330,6 +359,7 @@ class ReLUSquaredActivation(CustomOp): return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + #TODO : implement cuda kenrels return self.forward_native(x) @@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": lambda: GeluAndMul(), - "silu": lambda: SiluAndMul(), - "geglu": lambda: GeluAndMul(), + "gelu": + lambda: GeluAndMul(), + "silu": + lambda: SiluAndMul(), + "geglu": + lambda: GeluAndMul(), + "swigluoai": + lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 3c6ece6737..1e3ac6cd79 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -161,25 +161,13 @@ def fused_marlin_moe(hidden_states: torch.Tensor, if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - elif activation == "swiglu_oai": - # NOTE: in gpt-oss, the gate_proj and up_proj is interleaved - # - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2] - # - origin: gate, up = gate_up[..., :N], gate_up[..., N:] - - @torch.compile(dynamic=True) - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - return (up + 1) * glu - - intermediate_cache2 = swiglu_oai(intermediate_cache1) + elif activation == "swigluoai": + # alpha = 1.702, limit = 7.0 + 
torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) else: raise ValueError(f"Unsupported activation: {activation}. " - "Only silu and swiglu_oai activations are supported.") + "Only silu and swigluoai activations are supported.") if expert_map is not None: intermediate_cache3.zero_() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3579ca22ba..02b7b65f4a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1621,17 +1621,6 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) - # TODO fused kernel - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1639,13 +1628,16 @@ def fused_experts_impl( elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swigluoai" and is_act_and_mul: + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - elif activation == "swiglu_oai": - intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) + else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index deeb69bcad..48f9cc3737 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -61,14 +61,14 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "swiglu_oai", + activation: str = "swigluoai", expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swiglu_oai" + or scoring_func != "softmax" or activation != "swigluoai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7c7712dbe1..2f5d9ddd90 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ class MLPBlock(torch.nn.Module): prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swiglu_oai") + activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From fe0411fc6fa32cebeacd3a3aef87a591e7309c45 Mon Sep 17 00:00:00 2001 From: 
947132885 <947132885@qq.com> Date: Sun, 17 Aug 2025 16:46:36 +0800 Subject: [PATCH 346/932] [Bugfix] should use stack instead of concat (#22972) Signed-off-by: 947132885 <947132885@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/transformers.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4ec2b683fc..f3b7263ca3 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -694,6 +694,17 @@ class TransformersForCausalLM(TransformersBase): return logits +def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: + """Flatten until a list of tensors can be concatenated then do concat""" + + def _can_concat(x: list[torch.Tensor]): + return len(set(map(lambda _x: _x.shape[1:], x))) == 1 + + if _can_concat(x): + return torch.concat(x) + return flatten_and_concat(flatten_bn(x)) + + @MULTIMODAL_REGISTRY.register_processor( MultiModalProcessor, info=MultiModalProcessingInfo, @@ -766,8 +777,7 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): if isinstance(pixel_values, torch.Tensor): pixel_values = flatten_bn(pixel_values).to(self.dtype) elif is_list_of(pixel_values, torch.Tensor): - pixel_values = flatten_bn(flatten_bn(pixel_values), - concat=True).to(self.dtype) + pixel_values = flatten_and_concat(pixel_values).to(self.dtype) else: raise ValueError( f"Unsupported pixel_values type {type(pixel_values)}. " From 16bff144be6739c9f773968ace0b9cd239f67f19 Mon Sep 17 00:00:00 2001 From: Kevinzz Date: Sun, 17 Aug 2025 16:56:20 +0800 Subject: [PATCH 347/932] [Misc] fix typo in the multimodal doc (#23051) --- docs/features/multimodal_inputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cdd32924b5..9d51f9cf52 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -216,7 +216,7 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown from vllm import LLM, SamplingParams from qwen_vl_utils import process_vision_info - model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + model_path = "Qwen/Qwen2.5-VL-3B-Instruct" video_path = "https://content.pexels.com/videos/free-videos.mp4" llm = LLM( From 292084e72ac553dbe14eb897372617a786322a2a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 17 Aug 2025 11:52:04 -0400 Subject: [PATCH 348/932] [BugFix] Fix for IMA in FA3 varlen combine (#22967) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4e2a0e4533..49defccbb1 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd + GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From c55bc1db26f5e4385c8a2c1b7e6ba8b54ab2e060 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 10:36:46 -0700 Subject: [PATCH 349/932] [Misc] Remove dead return (#23061) 
Signed-off-by: Woosuk Kwon --- vllm/model_executor/models/qwen2_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f2d438b385..9e2f7ca42b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1225,7 +1225,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return [] - return None # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). From 6d243efedab9a03348cbd55fe966b62a08d90676 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 12:41:38 -0700 Subject: [PATCH 350/932] [Misc] Convert use_structured_output property into constant (#23060) Signed-off-by: Woosuk Kwon --- vllm/v1/request.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 8b703b6191..4e99a9ccef 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -54,8 +54,7 @@ class Request: time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: - self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,12 +62,15 @@ class Request: self.kv_transfer_params: Optional[dict[str, Any]] = None if pooling_params is not None: + # Pooling models. self.max_tokens = 1 elif sampling_params is not None: + # Generative models. assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens if sampling_params.guided_decoding is not None: self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = \ @@ -192,11 +194,6 @@ class Request: num_tokens = self.mm_positions[input_id].length return num_tokens - @property - def use_structured_output(self) -> bool: - return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None - def record_event( self, event_type: EngineCoreEventType, From 21e39436c8062ebbf4a160eebf56d7d303896e68 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Mon, 18 Aug 2025 05:45:42 +0800 Subject: [PATCH 351/932] [XPU] fix xpu to set cudagraph batch sizes (#23044) Signed-off-by: calvin chen --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4219d9147a..adaa1306f6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -232,8 +232,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. - self.cudagraph_batch_sizes = list( - reversed(self.compilation_config.cudagraph_capture_sizes)) + if self.compilation_config.cudagraph_capture_sizes and \ + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + self.cudagraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
self._init_device_properties() From 0fc8fa751a4321d6531467537ff77cf3c1c70260 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 17 Aug 2025 15:56:07 -0700 Subject: [PATCH 352/932] fix: gptq marlin weight loading failure (#23066) --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bd14ab9ef6..c5d1e01701 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -56,7 +56,7 @@ def get_moe_quant_method( # Dynamic per module/layer rules may override base config override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config) + return moe_method_cls(cloned_config, layer.moe_config) return None From 8ea0c2753a273e24957ab4587c200a3254ebe970 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 18:16:03 -0700 Subject: [PATCH 353/932] [Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index adaa1306f6..fc320be1c3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1722,7 +1722,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Compute prompt logprobs if needed. prompt_logprobs_dict = self._get_prompt_logprobs_dict( hidden_states[:num_scheduled_tokens], - scheduler_output, + scheduler_output.num_scheduled_tokens, ) # Get the valid generated tokens. @@ -2064,7 +2064,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, - scheduler_output: "SchedulerOutput", + num_scheduled_tokens: dict[str, int], ) -> dict[str, Optional[LogprobsTensors]]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: @@ -2077,8 +2077,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # maintainable loop over optimal performance. completed_prefill_reqs = [] for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): - - num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_tokens = num_scheduled_tokens[req_id] # Get metadata for this request. 
request = self.requests[req_id] From 7be3a59d8ee7014d6462c258222cbfa8be815831 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:09:08 +0800 Subject: [PATCH 354/932] [Misc] enhance static type hint (#23059) Signed-off-by: Andy Xie --- vllm/v1/worker/lora_model_runner_mixin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 2fbdee4724..84ed46989e 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -8,6 +8,7 @@ from contextlib import contextmanager from typing import Union import numpy as np +import torch import torch.nn as nn from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig @@ -31,7 +32,8 @@ class LoRAModelRunnerMixin: def load_lora_model(self, model: nn.Module, model_config: ModelConfig, scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, device: str) -> nn.Module: + lora_config: LoRAConfig, + device: torch.device) -> nn.Module: if not supports_lora(model): raise ValueError( From 9f1c6422549d37eee22bfa4dbadaaa91d95e98ba Mon Sep 17 00:00:00 2001 From: double7 <33449816+DoubleVII@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:09:11 +0800 Subject: [PATCH 355/932] [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com> Co-authored-by: 杨森 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e95295c318..59411eb750 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,6 +88,11 @@ def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]): video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) video_grid_sizes = video_grid_thw.prod(-1) + # vllm use `second_per_grid_ts` to compute multimodal rotary embedding + video_second_per_grid = hf_inputs.get("video_second_per_grid", None) + if video_second_per_grid is not None: + hf_inputs["second_per_grid_ts"] = video_second_per_grid + return dict( input_audio_features=MultiModalFieldConfig.flat_from_sizes( "audio", audio_feature_lengths, dim=1), From b2fd0b81e065c677ceebecb9a0e1ee6f226b7cec Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Mon, 18 Aug 2025 07:10:26 +0200 Subject: [PATCH 356/932] [Bugfix][CI] Machete kernels: deterministic ordering for more cache hits (#23055) Signed-off-by: Andy Lo --- csrc/quantization/machete/generate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 9af7833d09..88b3f9c734 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -349,9 +349,12 @@ def to_cute_constant(value: list[int]): def unique_schedules(impl_configs: list[ImplConfig]): - return list( - set(sch for impl_config in impl_configs - for sch in impl_config.schedules)) + # Use dict over set for deterministic ordering + return list({ + sch: None + for impl_config in impl_configs + for sch in impl_config.schedules + }.keys()) def unsigned_type_with_bitwidth(num_bits): From 
08d5f7113a024818b2867782c2539794b7aa162b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:16:21 +0800 Subject: [PATCH 357/932] [Misc] refactor function name (#23029) Signed-off-by: Andy Xie --- vllm/platforms/cpu.py | 2 +- vllm/v1/worker/cpu_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0b16a8e1d1..fe258f76b9 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -268,7 +268,7 @@ class CpuPlatform(Platform): DEFAULT_MAX_NUM_BATCHED_TOKENS) @classmethod - def get_allowed_cpu_memory_node_list( + def get_allowed_cpu_core_node_list( cls) -> tuple[list[int], list[LogicalCPUInfo]]: assert platform.system() == "Linux" diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 2dc28d9304..f83d680484 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -132,7 +132,7 @@ class CPUWorker(Worker): """ allowed_numa_nodes, logical_cpu_list = \ - CpuPlatform.get_allowed_cpu_memory_node_list() + CpuPlatform.get_allowed_cpu_core_node_list() assert len(allowed_numa_nodes) >= self.parallel_config.world_size, ( f"No enough allowed NUMA nodes to bind threads of " f"{self.parallel_config.world_size} CPUWorkers. " From 89657a557c6831cca9fa5e59822af0cf27d67a98 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 17 Aug 2025 23:33:29 -0700 Subject: [PATCH 358/932] [Misc] Fix backward compatibility from #23030 (#23070) Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/multimodal/base.py | 9 ++++++--- vllm/multimodal/inputs.py | 6 +++--- vllm/sequence.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17..c4bb8d56ce 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar if TYPE_CHECKING: from vllm.sequence import SequenceGroupMetadata -from .inputs import MultiModalKwargs, PlaceholderRange +from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange _T = TypeVar("_T") @@ -56,7 +56,8 @@ class MultiModalPlaceholderMap: @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: + ) -> tuple[dict[str, NestedTensors], dict[str, + "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a prompt (``seq_group``) represented by ``positions``, as well as a @@ -99,7 +100,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} + return MultiModalKwargs().get_data(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() @@ -116,6 +117,8 @@ class MultiModalPlaceholderMap: placeholder_maps[modality] = placeholder_map + seq_mm_data = seq_mm_data if isinstance( + seq_mm_data, dict) else seq_mm_data.get_data() return seq_mm_data, placeholder_maps def append_items_from_seq_group( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index d3f57cf533..3e0bfce59c 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -664,7 +664,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def modality(self) -> str: return self._modality - def get_data(self) -> Mapping[str, NestedTensors]: + def get_data(self) -> dict[str, NestedTensors]: return {key: elem.data for key, elem in 
self.items()} @@ -720,7 +720,7 @@ class MultiModalKwargs: items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) - self._data: Optional[Mapping[str, NestedTensors]] = None + self._data: Optional[dict[str, NestedTensors]] = None @property def modalities(self): @@ -883,7 +883,7 @@ class MultiModalKwargs: def get_data(self, *, - pin_memory: bool = False) -> Mapping[str, NestedTensors]: + pin_memory: bool = False) -> dict[str, NestedTensors]: if self._data is not None: return self._data diff --git a/vllm/sequence.py b/vllm/sequence.py index b3be10b6bb..2cb254381e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -22,6 +22,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.multimodal.inputs import NestedTensors from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) @@ -978,7 +979,8 @@ class SequenceGroupMetadata( state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) token_type_ids: Optional[list[int]] = None - multi_modal_data: Optional[MultiModalKwargs] = None + multi_modal_data: Optional[Union[MultiModalKwargs, + dict[str, "NestedTensors"]]] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None From 5f5664b3e4ff8046e26c36165a1294205cb429c5 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 15:04:08 +0800 Subject: [PATCH 359/932] [XPU] Fix compile size for xpu (#23069) Signed-off-by: Kunshang Ji --- vllm/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 51db277f65..cd2be212c2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3548,7 +3548,7 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: From 5c79b0d6484d7d4c5fe007c3c7ad04c72d3bc59e Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 17:47:03 +0800 Subject: [PATCH 360/932] [XPU][CI]add xpu env vars in CI scripts (#22946) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index deb61a9baf..445cd2735c 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -23,9 +23,13 @@ docker run \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ --name "${container_name}" \ "${image_name}" \ - sh -c ' + bash -c ' + set -e + echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py 
--model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp @@ -35,8 +39,8 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_utils.py pytest -v -s v1/test_metrics_reader.py From 27e8d1ea3ea9864f371f639daaa5315bf3250364 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 18 Aug 2025 17:52:00 +0800 Subject: [PATCH 361/932] [Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053) Signed-off-by: DarkLight1337 --- docs/api/README.md | 1 + docs/contributing/model/multimodal.md | 4 +- .../multimodal/processing/test_common.py | 14 +- .../multimodal/processing/test_glm4_1v.py | 3 +- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- .../multimodal/processing/test_llama4.py | 10 +- .../multimodal/processing/test_mllama.py | 6 +- .../multimodal/processing/test_mllama4.py | 10 +- .../multimodal/processing/test_nemotron_vl.py | 3 +- .../multimodal/processing/test_qwen2_vl.py | 3 +- tests/models/multimodal/test_tensor_schema.py | 2 +- tests/multimodal/test_cache.py | 11 +- tests/v1/test_serial_utils.py | 22 ++- vllm/executor/msgspec_utils.py | 9 +- vllm/model_executor/models/aria.py | 4 +- vllm/model_executor/models/aya_vision.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/cohere2_vision.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 7 +- vllm/model_executor/models/florence2.py | 4 +- vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/gemma3_mm.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 4 +- vllm/model_executor/models/glm4_1v.py | 10 +- vllm/model_executor/models/glm4v.py | 4 +- vllm/model_executor/models/granite_speech.py | 4 +- vllm/model_executor/models/h2ovl.py | 16 +- .../models/hyperclovax_vision.py | 27 +-- vllm/model_executor/models/idefics3.py | 4 +- vllm/model_executor/models/interns1.py | 13 +- vllm/model_executor/models/internvl.py | 34 ++-- vllm/model_executor/models/keye.py | 7 +- vllm/model_executor/models/kimi_vl.py | 4 +- vllm/model_executor/models/llava.py | 6 +- .../model_executor/models/llava_next_video.py | 4 +- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/minicpmo.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mistral3.py | 4 +- vllm/model_executor/models/mllama.py | 7 +- vllm/model_executor/models/mllama4.py | 12 +- vllm/model_executor/models/molmo.py | 4 +- vllm/model_executor/models/nvlm_d.py | 13 +- vllm/model_executor/models/ovis.py | 9 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- 
vllm/model_executor/models/phi4_multimodal.py | 4 +- vllm/model_executor/models/phi4mm.py | 4 +- vllm/model_executor/models/pixtral.py | 7 +- .../models/prithvi_geospatial_mae.py | 7 +- .../models/qwen2_5_omni_thinker.py | 15 +- vllm/model_executor/models/qwen2_audio.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 7 +- vllm/model_executor/models/qwen_vl.py | 4 +- vllm/model_executor/models/skyworkr1v.py | 13 +- vllm/model_executor/models/step3_vl.py | 14 +- vllm/model_executor/models/tarsier.py | 4 +- vllm/model_executor/models/transformers.py | 6 +- vllm/model_executor/models/ultravox.py | 9 +- vllm/model_executor/models/voxtral.py | 7 +- vllm/model_executor/models/whisper.py | 4 +- vllm/multimodal/__init__.py | 4 +- vllm/multimodal/base.py | 9 +- vllm/multimodal/cache.py | 21 ++- vllm/multimodal/inputs.py | 172 ++++++++---------- vllm/multimodal/parse.py | 11 +- vllm/multimodal/processing.py | 38 ++-- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/utils.py | 25 ++- vllm/sequence.py | 6 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/serial_utils.py | 41 ++++- vllm/v1/worker/gpu_input_batch.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/tpu_model_runner.py | 5 +- 77 files changed, 431 insertions(+), 383 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 327472df1d..57142e8f56 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -77,6 +77,7 @@ Internal data structures. - [vllm.multimodal.inputs.MultiModalFieldElem][] - [vllm.multimodal.inputs.MultiModalFieldConfig][] - [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargsItems][] - [vllm.multimodal.inputs.MultiModalKwargs][] - [vllm.multimodal.inputs.MultiModalInputs][] diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 64a48be326..76d0f067fd 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 906966ddd0..a1744317b3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -370,10 +370,16 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"} + b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"} + + assert a_rest == b_rest, msg + + a_data = a["mm_kwargs"].get_data() + b_data = b["mm_kwargs"].get_data() for key in ignore_mm_keys: - a["mm_kwargs"].pop(key, None) - b["mm_kwargs"].pop(key, None) + a_data.pop(key, None) + b_data.pop(key, None) - assert a == b, msg + assert a_data == b_data, msg diff --git 
a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index a6d900ec5d..a49842e109 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -45,7 +45,8 @@ def test_processor_override( video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) video_tok_count = processed_inputs["prompt_token_ids"].count( video_token_id) - grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + grid_t, _, _ = processed_inputs["mm_kwargs"].get_data( + )["video_grid_thw"][0] assert grid_t == expected_grid_t assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 76e4acc67d..1adfe21352 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -108,7 +108,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index c3e2841a8f..e4f25f5ac7 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -68,7 +68,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 5e14f0f996..bea4f43567 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -51,14 +51,14 @@ def test_processor_override( prompt = encode_tokens(tokenizer, prompt) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - mm_kwargs = processed_inputs["mm_kwargs"] + mm_data = processed_inputs["mm_kwargs"].get_data() # place holder replacements prompt_token_ids = processed_inputs["prompt_token_ids"] assert prompt_token_ids.count(config.boi_token_index) == num_imgs assert prompt_token_ids.count(config.eoi_token_index) == num_imgs assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs - aspect_ratios = mm_kwargs["aspect_ratios"] + aspect_ratios = mm_data["aspect_ratios"] num_x_separators = num_y_separators = 0 for tiles_y, tiles_x in aspect_ratios: if tiles_x * tiles_y > 1: @@ -80,6 +80,6 @@ def test_processor_override( num_patches_per_chunk = processor.info.get_patch_per_chunk( config.vision_config) assert prompt_token_ids.count(config.image_token_index) \ - == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk - assert mm_kwargs["pixel_values"].shape[0] \ - == 
mm_kwargs["patches_per_image"].sum() + == sum(mm_data["patches_per_image"]) * num_patches_per_chunk + assert len(mm_data["pixel_values"]) \ + == sum(mm_data["patches_per_image"]) diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index a6b20a1e36..b42d3f89f3 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,18 +49,18 @@ def test_profiling( encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) ] * max_num_seqs - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() # Get the actual number of encoder tokens for each sample. # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the # block manager to allocate blocks for those images only. # See MllamaMultiModalProcessor for more details. - num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")] + num_tiles = [[t] for t in mm_data.pop("num_tiles")] num_tokens_per_tile = calc_token_per_chunk(image_size) actual_encoder_seq_lens = [ sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index f3871b60c3..3be77b5da6 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int): hf_config = ctx.get_hf_config(Llama4Config) - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() image_size = hf_config.vision_config.image_size patch_size = hf_config.vision_config.patch_size downsample_ratio = int( round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))) tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio - chunks_per_image = prod(mm_kwargs["patches_per_image"]) + chunks_per_image = prod(mm_data["patches_per_image"]) total_num_patches = chunks_per_image * tokens_per_patch - num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][ - 0][1] # x-y seperator tokens + num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][ + 1] # x-y seperator tokens total_tokens = total_num_patches.item() + num_tiles.item( ) + 3 # image start, image, image end diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 6fbbab0d26..d9f1965a05 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -70,7 +70,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape) assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py 
b/tests/models/multimodal/processing/test_qwen2_vl.py index 9d1cd18338..985f4188fd 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -48,7 +48,8 @@ def test_processor_override( hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values"].shape assert img_tok_count == expected_toks_per_img * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 036624431c..51e5b84b6c 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -128,7 +128,7 @@ def create_batched_mm_kwargs( )["mm_kwargs"] items = [ item for modality in supported_mm_limits - for item in mm_kwargs.get_items(modality) + for item in mm_kwargs[modality] ] return group_mm_kwargs_by_modality(items) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 2149f05b6a..088cd00db2 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -4,8 +4,8 @@ import pytest import torch from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField) @@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): ]) -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs([ +def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargsItems.from_seq([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) @@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): [ (_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100, "a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501 ], ) # yapf: enable diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 586276ee08..118b40d0ef 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -11,7 +11,8 @@ import torch from vllm.multimodal.inputs import (MultiModalBatchedField, MultiModalFieldElem, MultiModalFlatField, - MultiModalKwargs, MultiModalKwargsItem, + MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField, NestedTensors) from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch): class MyRequest(msgspec.Struct): - mm: Optional[list[MultiModalKwargs]] + mm: Optional[list[MultiModalKwargsItems]] def test_multimodal_kwargs(): @@ -119,7 +120,7 @@ def test_multimodal_kwargs(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, 
e4]) - mm = MultiModalKwargs([audio, video, image]) + mm = MultiModalKwargsItems.from_seq([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) @@ -133,19 +134,22 @@ def test_multimodal_kwargs(): total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - # expected total encoding length, should be 14255, +-20 for minor changes - assert 14250 <= total_len <= 14300 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] + # expected total encoding length, should be 14306, +-20 for minor changes + assert 14275 <= total_len <= 14325 + decoded = decoder.decode(encoded).mm[0] + assert isinstance(decoded, MultiModalKwargsItems) # check all modalities were recovered and do some basic sanity checks - assert len(decoded.modalities) == 3 - images = decoded.get_items("image") + assert len(decoded) == 3 + images = decoded["image"] assert len(images) == 1 assert len(images[0].items()) == 2 assert list(images[0].keys()) == ["i0", "i1"] # check the tensor contents and layout in the main dict - assert all(nested_equal(mm[k], decoded[k]) for k in mm) + mm_data = mm.get_data() + decoded_data = decoded.get_data() + assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data) def nested_equal(a: NestedTensors, b: NestedTensors): diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 852c8f5cff..4ce6d8dfad 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -4,11 +4,12 @@ from array import array from typing import Any, Type +from vllm.multimodal.inputs import MultiModalKwargs from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE def encode_hook(obj: Any) -> Any: - """Custom msgspec enc hook that supports array types. + """Custom msgspec enc hook that supports array types and MultiModalKwargs. See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any: f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " f"Given array has a type code of {obj.typecode}.") return obj.tobytes() + if isinstance(obj, MultiModalKwargs): + return dict(obj) def decode_hook(type: Type, obj: Any) -> Any: - """Custom msgspec dec hook that supports array types. + """Custom msgspec dec hook that supports array types and MultiModalKwargs. 
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized + if type is MultiModalKwargs: + return MultiModalKwargs(obj) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index e1368a3f64..1c7960fa3e 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -470,7 +470,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 5cd74bbba4..b02a973d94 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -18,7 +18,7 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -242,7 +242,7 @@ class AyaVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8e3505f872..2f2b880bb0 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -492,7 +492,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() diff --git 
a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8d705f40ce..e6914ad4c4 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -151,7 +151,7 @@ class ChameleonMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index f17583768f..bc526fd661 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -241,7 +241,7 @@ class Cohere2VisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0acca75d9..e881e9c6dd 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, 
Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 56e456c2f1..4a8cb35a54 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseProcessingInfo, EncDecMultiModalProcessor, @@ -860,7 +860,7 @@ class Florence2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() pad_token_id = hf_config.pad_token_id diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b61e0361fe..90af859ab9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 9871b11b37..bf5ad633b9 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) # yapf: disable @@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.boi_token diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index a0c3bb5007..79061fd30c 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ 
-24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalDataParser) # yapf: disable @@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 88c53c8363..015577322f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, VideoItem) + MultiModalKwargsItems, VideoItem) from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( @@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): merge_length = image_processor.merge_size**2 def get_image_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length return [hf_processor.image_token_id] * num_tokens def get_video_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1751fccd08..bf33575859 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, 
object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index c9e3b74e7c..c3ac3bb78c 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> list[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index c3e4f81597..9ab3f4d0d9 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, @@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index e5c94c7f3a..d3ddc47ea9 100644 --- 
a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache, @@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() placeholder = { @@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor( def get_replacement_hyperclovax( item_idx: int, modality: str, - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ): - num_tokens = None + out_item = out_mm_kwargs[modality][item_idx] + if modality == "image": + lens = out_item["vision_query_lengths_images"].data num_tokens = self.info.get_num_image_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_images"][item_idx], ) - if modality == "video": + vision_query_length=lens) + elif modality == "video": + lens = out_item["vision_query_lengths_videos"].data num_tokens = self.info.get_num_video_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_videos"][item_idx], ) - assert isinstance(num_tokens, int) - return [ - placeholder[modality], - ] * num_tokens + vision_query_length=lens) + else: + raise NotImplementedError(modality) + + return [placeholder[modality]] * num_tokens return [ PromptReplacement( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3c01789b90..63307470d9 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageProcessorItems, ImageSize # yapf conflicts with isort for this block # yapf: disable @@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token, _, _ = self.info._get_image_token(hf_processor) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d952ced2fa..c739e74b05 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, 
ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -399,7 +399,7 @@ class InternS1MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) img_context_token = hf_processor.image_token @@ -407,15 +407,16 @@ class InternS1MultiModalProcessor( end_image_token = hf_processor.end_image_token video_token = hf_processor.video_token - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: video_num_patches = [] - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() else: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8e766dd4c4..da8ad83967 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -966,15 +967,19 @@ class InternVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: - prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( - mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + prompt_repl = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) 
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: @@ -992,12 +997,15 @@ class InternVLMultiModalProcessor( video_context_token=hf_processor.video_token) if self.info.supports_video: - prompt_repl.append( + prompt_repl = [ + *prompt_repl, PromptReplacement( modality="video", target="